Index: lib/Target/AArch64/AArch64.td
===================================================================
--- lib/Target/AArch64/AArch64.td
+++ lib/Target/AArch64/AArch64.td
@@ -38,6 +38,9 @@
 def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
   "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
 
+def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
+  "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
+
 def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
   "Enable ARMv8 PMUv3 Performance Monitors extension">;
 
@@ -114,7 +117,7 @@
 //
 
 def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
-  "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>;
+  "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>;
 
 def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
   "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
@@ -270,6 +273,7 @@
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
+                                   FeatureRDM,
                                    FeatureZCZeroing
                                    ]>;
 
Index: lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- lib/Target/AArch64/AArch64InstrFormats.td
+++ lib/Target/AArch64/AArch64InstrFormats.td
@@ -9060,7 +9060,7 @@
 // AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
 //----------------------------------------------------------------------------
 
-let Predicates = [HasNEON, HasV8_1a] in {
+let Predicates = [HasNEON, HasRDM] in {
 class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
                                     RegisterOperand regtype, string asm,
@@ -9221,7 +9221,7 @@
     let Inst{21} = idx{0};
   }
 }
-} // let Predicates = [HasNeon, HasV8_1a]
+} // let Predicates = [HasNeon, HasRDM]
 
 //----------------------------------------------------------------------------
 // Crypto extensions
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -30,6 +30,8 @@
                        AssemblerPredicate<"FeatureLSE", "lse">;
 def HasRAS           : Predicate<"Subtarget->hasRAS()">,
                        AssemblerPredicate<"FeatureRAS", "ras">;
+def HasRDM           : Predicate<"Subtarget->hasRDM()">,
+                       AssemblerPredicate<"FeatureRDM", "rdm">;
 def HasPerfMon       : Predicate<"Subtarget->hasPerfMon()">;
 def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
                        AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
@@ -3284,7 +3286,7 @@
 defm UQSUB   : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
 defm URSHL   : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
 defm USHL    : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
-let Predicates = [HasV8_1a] in {
+let Predicates = [HasRDM] in {
   defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
   defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
   def : Pat<(i32 (int_aarch64_neon_sqadd
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -61,6 +61,7 @@
   bool HasCRC = false;
   bool HasLSE = false;
   bool HasRAS = false;
+  bool HasRDM = false;
   bool HasPerfMon = false;
   bool HasFullFP16 = false;
   bool HasSPE = false;
@@ -183,6 +184,7 @@
   bool hasCRC() const { return HasCRC; }
   bool hasLSE() const { return HasLSE; }
   bool hasRAS() const { return HasRAS; }
+  bool hasRDM() const { return HasRDM; }
   bool balanceFPOps() const { return BalanceFPOps; }
   bool predictableSelectIsExpensive() const {
     return PredictableSelectIsExpensive;
Index: test/CodeGen/AArch64/arm64-neon-v8.1a.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-RDM
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
 
@@ -32,6 +33,7 @@
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
   %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-RDM: sqrdmlah v0.4h, v1.4h, v2.4h
 ; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
 ; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
   ret <4 x i16> %retval
@@ -42,6 +44,7 @@
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
   %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-RDM: sqrdmlah v0.8h, v1.8h, v2.8h
 ; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
   ret <8 x i16> %retval
@@ -52,6 +55,7 @@
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
   %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-RDM: sqrdmlah v0.2s, v1.2s, v2.2s
 ; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
 ; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
   ret <2 x i32> %retval
@@ -61,7 +65,8 @@
 ; CHECK-LABEL: test_sqrdmlah_v4i32:
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
   %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK-V81: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-RDM: sqrdmlah v0.4s, v1.4s, v2.4s
 ; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
 ; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
   ret <4 x i32> %retval
@@ -72,6 +77,7 @@
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
   %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-RDM: sqrdmlsh v0.4h, v1.4h, v2.4h
 ; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
 ; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
   ret <4 x i16> %retval
@@ -82,6 +88,7 @@
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
   %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-RDM: sqrdmlsh v0.8h, v1.8h, v2.8h
 ; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
 ; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
   ret <8 x i16> %retval
@@ -92,6 +99,7 @@
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
   %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-RDM: sqrdmlsh v0.2s, v1.2s, v2.2s
 ; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
 ; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
   ret <2 x i32> %retval
@@ -102,6 +110,7 @@
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
   %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-RDM: sqrdmlsh v0.4s, v1.4s, v2.4s
 ; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
 ; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
   ret <4 x i32> %retval
@@ -118,6 +127,7 @@
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
 ; CHECK-V8a : sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-RDM: sqrdmlah v0.4h, v1.4h, v2.h[3]
 ; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
 ; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
   ret <4 x i16> %retval
@@ -130,6 +140,7 @@
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-RDM: sqrdmlah v0.8h, v1.8h, v2.h[2]
 ; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
   ret <8 x i16> %retval
@@ -142,6 +153,7 @@
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-RDM: sqrdmlah v0.2s, v1.2s, v2.s[1]
 ; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
 ; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
   ret <2 x i32> %retval
@@ -154,6 +166,7 @@
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-RDM: sqrdmlah v0.4s, v1.4s, v2.s[0]
 ; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
 ; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
   ret <4 x i32> %retval
@@ -166,6 +179,7 @@
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-RDM: sqrdmlsh v0.4h, v1.4h, v2.h[3]
 ; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
 ; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
   ret <4 x i16> %retval
@@ -178,6 +192,7 @@
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-RDM: sqrdmlsh v0.8h, v1.8h, v2.h[2]
 ; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
 ; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
   ret <8 x i16> %retval
@@ -190,6 +205,7 @@
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-RDM: sqrdmlsh v0.2s, v1.2s, v2.s[1]
 ; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
 ; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
   ret <2 x i32> %retval
@@ -202,6 +218,7 @@
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-RDM: sqrdmlsh v0.4s, v1.4s, v2.s[0]
 ; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
 ; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
   ret <4 x i32> %retval
@@ -221,6 +238,7 @@
   %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-RDM: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 ; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 ; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
   ret i16 %retval
@@ -235,6 +253,7 @@
   %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
   %retval = extractelement <8 x i16> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-RDM: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 ; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 ; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
   ret i16 %retval
@@ -248,6 +267,7 @@
   %extract = extractelement <2 x i32> %prod, i64 0
   %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
 ; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-RDM: sqrdmlah v2.2s, v0.2s, v1.s[0]
 ; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
 ; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
   ret i32 %retval
@@ -261,6 +281,7 @@
   %extract = extractelement <4 x i32> %prod, i64 0
   %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
 ; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-RDM: sqrdmlah v2.4s, v0.4s, v1.s[0]
 ; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
 ; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
   ret i32 %retval
@@ -275,6 +296,7 @@
   %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-RDM: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 ; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
   ret i16 %retval
@@ -289,6 +311,7 @@
   %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
   %retval = extractelement <8 x i16> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-RDM: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 ; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
   ret i16 %retval
@@ -302,6 +325,7 @@
   %extract = extractelement <2 x i32> %prod, i64 0
   %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
 ; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-RDM: sqrdmlsh v2.2s, v0.2s, v1.s[0]
 ; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
 ; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
   ret i32 %retval
@@ -315,6 +339,7 @@
   %extract = extractelement <4 x i32> %prod, i64 0
   %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
 ; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-RDM: sqrdmlsh v2.4s, v0.4s, v1.s[0]
 ; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
 ; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
   ret i32 %retval
@@ -333,6 +358,7 @@
   %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-RDM: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
   ret i16 %retval
@@ -347,6 +373,7 @@
   %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
   %retval = extractelement <4 x i32> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-RDM: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
   ret i32 %retval
@@ -362,6 +389,7 @@
   %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-RDM: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
   ret i16 %retval
@@ -376,6 +404,7 @@
   %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
   %retval = extractelement <4 x i32> %retval_vec, i64 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-RDM: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
   ret i32 %retval
@@ -385,6 +414,7 @@
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
   %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-RDM: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret i32 %retval
@@ -395,6 +425,7 @@
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
   %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-RDM: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret i32 %retval
@@ -414,6 +445,7 @@
   %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
   %retval = extractelement <4 x i16> %retval_vec, i32 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
+; CHECK-RDM: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
   ret i16 %retval
@@ -425,6 +457,7 @@
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
   %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-RDM: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 ; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 ; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
   ret i32 %retval
@@ -439,6 +472,7 @@
   %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
   %retval = extractelement <8 x i16> %retval_vec, i32 0
 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
+; CHECK-RDM: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
   ret i16 %retval
@@ -450,6 +484,7 @@
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
   %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-RDM: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 ; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 ; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
   ret i32 %retval
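
For reference, a minimal standalone check of the new flag might look like the sketch below. It is illustrative only and not part of the patch: it reuses the llvm.aarch64.neon.sqrdmulh/sqadd intrinsics and the -mattr=+rdm option introduced above, and the function name is made up. With +rdm enabled, the sqrdmulh/sqadd pair is expected to combine into sqrdmlah, mirroring the CHECK-RDM lines in the updated test.

; Hypothetical smoke test: %acc, %mhs and %rhs arrive in v0, v1 and v2, so the
; fused instruction should read sqrdmlah v0.4h, v1.4h, v2.4h under +rdm.
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm | FileCheck %s

declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)

define <4 x i16> @rdm_smoke(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: rdm_smoke:
; CHECK: sqrdmlah v0.4h, v1.4h, v2.4h
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
  ret <4 x i16> %retval
}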