diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -700,6 +700,9 @@
 
 - Add ISA of ``AMX-COMPLEX`` which supports ``tcmmimfp16ps`` and
   ``tcmmrlfp16ps``.
+- Support ISA of ``SM4``.
+  * Support intrinsic of ``_mm(256)_sm4key4_epi32``.
+  * Support intrinsic of ``_mm(256)_sm4rnds4_epi32``.
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2132,6 +2132,12 @@
 TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16128, "V8yV4f", "nV:128:", "avx512bf16,avx512vl|avxneconvert")
 TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16256, "V8yV8f", "nV:256:", "avx512bf16,avx512vl|avxneconvert")
 
+// SM4
+TARGET_BUILTIN(__builtin_ia32_vsm4key4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
+TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
+TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
+TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
+
 TARGET_HEADER_BUILTIN(_InterlockedAnd64,         "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(_InterlockedDecrement64,   "WiWiD*",   "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(_InterlockedExchange64,    "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5043,6 +5043,8 @@
 def mno_sgx : Flag<["-"], "mno-sgx">, Group<m_x86_Features_Group>;
 def msha : Flag<["-"], "msha">, Group<m_x86_Features_Group>;
 def mno_sha : Flag<["-"], "mno-sha">, Group<m_x86_Features_Group>;
+def msm4 : Flag<["-"], "msm4">, Group<m_x86_Features_Group>;
+def mno_sm4 : Flag<["-"], "mno-sm4">, Group<m_x86_Features_Group>;
 def mtbm : Flag<["-"], "mtbm">, Group<m_x86_Features_Group>;
 def mno_tbm : Flag<["-"], "mno-tbm">, Group<m_x86_Features_Group>;
 def mtsxldtrk : Flag<["-"], "mtsxldtrk">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -114,6 +114,7 @@
   bool HasSHA = false;
   bool HasSHSTK = false;
   bool HasSGX = false;
+  bool HasSM4 = false;
   bool HasCX8 = false;
   bool HasCX16 = false;
   bool HasFXSR = false;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -263,6 +263,8 @@
       HasSHA = true;
     } else if (Feature == "+shstk") {
       HasSHSTK = true;
+    } else if (Feature == "+sm4") {
+      HasSM4 = true;
     } else if (Feature == "+movbe") {
       HasMOVBE = true;
     } else if (Feature == "+sgx") {
@@ -749,6 +751,8 @@
     Builder.defineMacro("__AVX512VP2INTERSECT__");
   if (HasSHA)
     Builder.defineMacro("__SHA__");
+  if (HasSM4)
+    Builder.defineMacro("__SM4__");
 
   if (HasFXSR)
     Builder.defineMacro("__FXSR__");
@@ -1000,6 +1004,7 @@
       .Case("sgx", true)
       .Case("sha", true)
       .Case("shstk", true)
+      .Case("sm4", true)
       .Case("sse", true)
       .Case("sse2", true)
       .Case("sse3", true)
@@ -1105,6 +1110,7 @@
       .Case("sgx", HasSGX)
       .Case("sha", HasSHA)
       .Case("shstk", HasSHSTK)
+      .Case("sm4", HasSM4)
       .Case("sse", SSELevel >= SSE1)
       .Case("sse2", SSELevel >= SSE2)
       .Case("sse3", SSELevel >= SSE3)
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -204,6 +204,7 @@
   serializeintrin.h
   sgxintrin.h
   shaintrin.h
+  sm4intrin.h
   smmintrin.h
   tbmintrin.h
   tmmintrin.h
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -269,6 +269,11 @@
 #include <avxneconvertintrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SM4__)
+#include <sm4intrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__RDPID__)
 /// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
diff --git a/clang/lib/Headers/sm4intrin.h b/clang/lib/Headers/sm4intrin.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/Headers/sm4intrin.h
@@ -0,0 +1,269 @@
+/*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __SM4INTRIN_H
+#define __SM4INTRIN_H
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
+///    operates on independent 128-bit lanes. The calculated results are
+///    stored in \a dst.
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_sm4key4_epi32(__m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x int].
+/// \param __B
+///    A 128-bit vector of [4 x int].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// DEFINE ROL32(dword, n) {
+/// 	count := n % 32
+/// 	dest := (dword << count) | (dword >> (32-count))
+/// 	RETURN dest
+/// }
+/// DEFINE SBOX_BYTE(dword, i) {
+/// 	RETURN sbox[dword.byte[i]]
+/// }
+/// DEFINE lower_t(dword) {
+/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
+/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
+/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
+/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
+/// 	RETURN tmp
+/// }
+/// DEFINE L_KEY(dword) {
+/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
+/// }
+/// DEFINE T_KEY(dword) {
+/// 	RETURN L_KEY(lower_t(dword))
+/// }
+/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
+/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
+/// }
+/// FOR i:= 0 to 0
+/// 	P[0] := __B.xmm[i].dword[0]
+/// 	P[1] := __B.xmm[i].dword[1]
+/// 	P[2] := __B.xmm[i].dword[2]
+/// 	P[3] := __B.xmm[i].dword[3]
+/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
+/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
+/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
+/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
+/// 	DEST.xmm[i].dword[0] := C[0]
+/// 	DEST.xmm[i].dword[1] := C[1]
+/// 	DEST.xmm[i].dword[2] := C[2]
+/// 	DEST.xmm[i].dword[3] := C[3]
+/// ENDFOR
+/// DEST[MAX:128] := 0
+/// \endcode
+#define _mm_sm4key4_epi32(A, B)                                                \
+  ((__m128i)__builtin_ia32_vsm4key4128((__v4su)(A), (__v4su)(B)))
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
+///    operates on independent 128-bit lanes. The calculated results are
+///    stored in \a dst.
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x int].
+/// \param __B
+///    A 256-bit vector of [8 x int].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// DEFINE ROL32(dword, n) {
+/// 	count := n % 32
+/// 	dest := (dword << count) | (dword >> (32-count))
+/// 	RETURN dest
+/// }
+/// DEFINE SBOX_BYTE(dword, i) {
+/// 	RETURN sbox[dword.byte[i]]
+/// }
+/// DEFINE lower_t(dword) {
+/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
+/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
+/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
+/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
+/// 	RETURN tmp
+/// }
+/// DEFINE L_KEY(dword) {
+/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
+/// }
+/// DEFINE T_KEY(dword) {
+/// 	RETURN L_KEY(lower_t(dword))
+/// }
+/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
+/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
+/// }
+/// FOR i:= 0 to 1
+/// 	P[0] := __B.xmm[i].dword[0]
+/// 	P[1] := __B.xmm[i].dword[1]
+/// 	P[2] := __B.xmm[i].dword[2]
+/// 	P[3] := __B.xmm[i].dword[3]
+/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
+/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
+/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
+/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
+/// 	DEST.xmm[i].dword[0] := C[0]
+/// 	DEST.xmm[i].dword[1] := C[1]
+/// 	DEST.xmm[i].dword[2] := C[2]
+/// 	DEST.xmm[i].dword[3] := C[3]
+/// ENDFOR
+/// DEST[MAX:256] := 0
+/// \endcode
+#define _mm256_sm4key4_epi32(A, B)                                             \
+  ((__m256i)__builtin_ia32_vsm4key4256((__v8su)(A), (__v8su)(B)))
+
+/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
+///    operates on independent 128-bit lanes. The calculated results are
+///    stored in \a dst.
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x int].
+/// \param __B
+///    A 128-bit vector of [4 x int].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// DEFINE ROL32(dword, n) {
+/// 	count := n % 32
+/// 	dest := (dword << count) | (dword >> (32-count))
+/// 	RETURN dest
+/// }
+/// DEFINE lower_t(dword) {
+/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
+/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
+/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
+/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
+/// 	RETURN tmp
+/// }
+/// DEFINE L_RND(dword) {
+/// 	tmp := dword
+/// 	tmp := tmp ^ ROL32(dword, 2)
+/// 	tmp := tmp ^ ROL32(dword, 10)
+/// 	tmp := tmp ^ ROL32(dword, 18)
+/// 	tmp := tmp ^ ROL32(dword, 24)
+///   RETURN tmp
+/// }
+/// DEFINE T_RND(dword) {
+/// 	RETURN L_RND(lower_t(dword))
+/// }
+/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
+/// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
+/// }
+/// FOR i:= 0 to 0
+/// 	P[0] := __B.xmm[i].dword[0]
+/// 	P[1] := __B.xmm[i].dword[1]
+/// 	P[2] := __B.xmm[i].dword[2]
+/// 	P[3] := __B.xmm[i].dword[3]
+/// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
+/// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
+/// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
+/// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
+/// 	DEST.xmm[i].dword[0] := C[0]
+/// 	DEST.xmm[i].dword[1] := C[1]
+/// 	DEST.xmm[i].dword[2] := C[2]
+/// 	DEST.xmm[i].dword[3] := C[3]
+/// ENDFOR
+/// DEST[MAX:128] := 0
+/// \endcode
+#define _mm_sm4rnds4_epi32(A, B)                                               \
+  ((__m128i)__builtin_ia32_vsm4rnds4128((__v4su)(A), (__v4su)(B)))
+
+/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
+///    operates on independent 128-bit lanes. The calculated results are
+///    stored in \a dst.
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x int].
+/// \param __B
+///    A 256-bit vector of [8 x int].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// DEFINE ROL32(dword, n) {
+/// 	count := n % 32
+/// 	dest := (dword << count) | (dword >> (32-count))
+/// 	RETURN dest
+/// }
+/// DEFINE lower_t(dword) {
+/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
+/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
+/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
+/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
+/// 	RETURN tmp
+/// }
+/// DEFINE L_RND(dword) {
+/// 	tmp := dword
+/// 	tmp := tmp ^ ROL32(dword, 2)
+/// 	tmp := tmp ^ ROL32(dword, 10)
+/// 	tmp := tmp ^ ROL32(dword, 18)
+/// 	tmp := tmp ^ ROL32(dword, 24)
+///   RETURN tmp
+/// }
+/// DEFINE T_RND(dword) {
+/// 	RETURN L_RND(lower_t(dword))
+/// }
+/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
+/// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
+/// }
+/// FOR i:= 0 to 1
+/// 	P[0] := __B.xmm[i].dword[0]
+/// 	P[1] := __B.xmm[i].dword[1]
+/// 	P[2] := __B.xmm[i].dword[2]
+/// 	P[3] := __B.xmm[i].dword[3]
+/// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
+/// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
+/// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
+/// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
+/// 	DEST.xmm[i].dword[0] := C[0]
+/// 	DEST.xmm[i].dword[1] := C[1]
+/// 	DEST.xmm[i].dword[2] := C[2]
+/// 	DEST.xmm[i].dword[3] := C[3]
+/// ENDFOR
+/// DEST[MAX:256] := 0
+/// \endcode
+#define _mm256_sm4rnds4_epi32(A, B)                                            \
+  ((__m256i)__builtin_ia32_vsm4rnds4256((__v8su)(A), (__v8su)(B)))
+
+#endif // __SM4INTRIN_H
diff --git a/clang/test/CodeGen/X86/sm4-builtins.c b/clang/test/CodeGen/X86/sm4-builtins.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/X86/sm4-builtins.c
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +sm4 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +sm4 -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m128i test_mm_sm4key4_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_sm4key4_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_sm4key4_epi32(__A, __B);
+}
+
+__m256i test_mm256_sm4key4_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_sm4key4_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_sm4key4_epi32(__A, __B);
+}
+
+__m128i test_mm_sm4rnds4_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_sm4rnds4_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_sm4rnds4_epi32(__A, __B);
+}
+
+__m256i test_mm256_sm4rnds4_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_sm4rnds4_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_sm4rnds4_epi32(__A, __B);
+}
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@
 // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686"
 // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
 // CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
 // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
 // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
 // CHECK-NOT: tune-cpu
 // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx"
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -349,6 +349,11 @@
 // AVXNECONVERT: "-target-feature" "+avxneconvert"
 // NO-AVXNECONVERT: "-target-feature" "-avxneconvert"
 
+// RUN: %clang --target=i386 -msm4 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SM4 %s
+// RUN: %clang --target=i386 -mno-sm4 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SM4 %s
+// SM4: "-target-feature" "+sm4"
+// NO-SM4: "-target-feature" "-sm4"
+
 // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s
 // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s
 // CRC32: "-target-feature" "+crc32"
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -660,6 +660,19 @@
 // AVXNECONVERTNOAVX2-NOT: #define __AVX2__ 1
 // AVXNECONVERTNOAVX2-NOT: #define __AVXNECONVERT__ 1
 
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -msm4 -x c -E -dM -o - %s | FileCheck  -check-prefix=SM4 %s
+
+// SM4: #define __AVX__ 1
+// SM4: #define __SM4__ 1
+
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mno-sm4 -x c -E -dM -o - %s | FileCheck  -check-prefix=NOSM4 %s
+// NOSM4-NOT: #define __SM4__ 1
+
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -msm4 -mno-avx -x c -E -dM -o - %s | FileCheck  -check-prefix=SM4NOAVX %s
+
+// SM4NOAVX-NOT: #define __AVX__ 1
+// SM4NOAVX-NOT: #define __SM4__ 1
+
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s
 
 // CRC32: #define __CRC32__ 1
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -276,7 +276,7 @@
 
 * ``__builtin_unpredictable`` (unpredictable metadata in LLVM IR), is handled by X86 Backend.
   ``X86CmovConversion`` pass now respects this builtin and does not convert CMOVs to branches.
-
+* Support ISA of ``SM4``.
 
 Changes to the OCaml bindings
 -----------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5513,6 +5513,30 @@
       DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8f32_ty], [IntrNoMem]>;
 }
 //===----------------------------------------------------------------------===//
+// SM4 intrinsics
+let TargetPrefix = "x86" in {
+  def int_x86_vsm4key4128
+      : ClangBuiltin<"__builtin_ia32_vsm4key4128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+        [llvm_v4i32_ty, llvm_v4i32_ty],
+        [IntrNoMem]>;
+  def int_x86_vsm4key4256
+      : ClangBuiltin<"__builtin_ia32_vsm4key4256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+        [llvm_v8i32_ty, llvm_v8i32_ty],
+        [IntrNoMem]>;
+  def int_x86_vsm4rnds4128
+      : ClangBuiltin<"__builtin_ia32_vsm4rnds4128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+        [llvm_v4i32_ty, llvm_v4i32_ty],
+        [IntrNoMem]>;
+  def int_x86_vsm4rnds4256
+      : ClangBuiltin<"__builtin_ia32_vsm4rnds4256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+        [llvm_v8i32_ty, llvm_v8i32_ty],
+        [IntrNoMem]>;
+}
+//===----------------------------------------------------------------------===//
 // RAO-INT intrinsics
 let TargetPrefix = "x86" in {
   def int_x86_aadd32
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -227,6 +227,7 @@
 X86_FEATURE       (AVXVNNI,         "avxvnni")
 X86_FEATURE       (AVXIFMA,         "avxifma")
 X86_FEATURE       (AVXVNNIINT8,     "avxvnniint8")
+X86_FEATURE       (SM4,             "sm4")
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
 X86_FEATURE       (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -242,6 +242,9 @@
 // using Shadow Stack
 def FeatureSHSTK   : SubtargetFeature<"shstk", "HasSHSTK", "true",
                        "Support CET Shadow-Stack instructions">;
+def FeatureSM4     : SubtargetFeature<"sm4", "HasSM4", "true",
+                                      "Support SM4 instructions",
+                                      [FeatureAVX]>;
 def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
                                       "Support PRFCHW instructions">;
 def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -986,6 +986,7 @@
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
 def HasSHSTK     : Predicate<"Subtarget->hasSHSTK()">;
+def HasSM4       : Predicate<"Subtarget->hasSM4()">;
 def HasCLFLUSH   : Predicate<"Subtarget->hasCLFLUSH()">;
 def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
 def HasCLWB      : Predicate<"Subtarget->hasCLWB()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8295,3 +8295,27 @@
                 (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                 (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;
+
+// FIXME: Is there a better scheduler class for SM4 than WriteVecIMul?
+let Predicates = [HasSM4] in {
+  multiclass SM4_Base<string OpStr, RegisterClass RC, string VL,
+                      PatFrag LD, X86MemOperand MemOp> {
+    def rr : I<0xda, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2),
+               !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
+                  RC:$src2))]>,
+               Sched<[WriteVecIMul]>;
+    def rm : I<0xda, MRMSrcMem, (outs RC:$dst),
+               (ins RC:$src1, MemOp:$src2),
+               !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
+                 (LD addr:$src2)))]>,
+               Sched<[WriteVecIMul]>;
+  }
+}
+
+defm VSM4KEY4  : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX_4V;
+defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX_4V;
+defm VSM4RNDS4  : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX_4V;
+defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX_4V;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1746,6 +1746,7 @@
   Features["amx-int8"]   = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave;
   bool HasLeaf7Subleaf1 =
       MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
+  Features["sm4"]        = HasLeaf7Subleaf1 && ((EAX >> 2) & 1) && HasAVXSave;
   Features["raoint"]     = HasLeaf7Subleaf1 && ((EAX >> 3) & 1);
   Features["avxvnni"]    = HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave;
   Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -613,6 +613,7 @@
 constexpr FeatureBitset ImpliedFeaturesSHA = FeatureSSE2;
 constexpr FeatureBitset ImpliedFeaturesVAES = FeatureAES | FeatureAVX;
 constexpr FeatureBitset ImpliedFeaturesVPCLMULQDQ = FeatureAVX | FeaturePCLMUL;
+constexpr FeatureBitset ImpliedFeaturesSM4 = FeatureAVX;
 
 // AVX512 features.
 constexpr FeatureBitset ImpliedFeaturesAVX512CD = FeatureAVX512F;
diff --git a/llvm/test/CodeGen/X86/sm4-intrinsics.ll b/llvm/test/CodeGen/X86/sm4-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sm4-intrinsics.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+sm4 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+sm4 | FileCheck %s
+
+define <4 x i32> @test_int_x86_vsm4key4128(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: test_int_x86_vsm4key4128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsm4key4 %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x7a,0xda,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %A, <4 x i32> %B)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %A, <4 x i32> %B)
+
+define <8 x i32> @test_int_x86_vsm4key4256(<8 x i32> %A, <8 x i32> %B) {
+; CHECK-LABEL: test_int_x86_vsm4key4256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsm4key4 %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7e,0xda,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %A, <8 x i32> %B)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %A, <8 x i32> %B)
+
+define <4 x i32> @test_int_x86_vsm4rnds4128(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: test_int_x86_vsm4rnds4128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsm4rnds4 %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x7b,0xda,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %A, <4 x i32> %B)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %A, <4 x i32> %B)
+
+define <8 x i32> @test_int_x86_vsm4rnds4256(<8 x i32> %A, <8 x i32> %B) {
+; CHECK-LABEL: test_int_x86_vsm4rnds4256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsm4rnds4 %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7f,0xda,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %A, <8 x i32> %B)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %A, <8 x i32> %B)
diff --git a/llvm/test/MC/Disassembler/X86/sm4-32.txt b/llvm/test/MC/Disassembler/X86/sm4-32.txt
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/sm4-32.txt
@@ -0,0 +1,114 @@
+# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:        vsm4key4 %ymm4, %ymm3, %ymm2
+# INTEL:      vsm4key4 ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xda,0xd4
+
+# ATT:        vsm4key4 %xmm4, %xmm3, %xmm2
+# INTEL:      vsm4key4 xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xda,0xd4
+
+# ATT:        vsm4key4  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vsm4key4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xda,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vsm4key4  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vsm4key4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0xda,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vsm4key4  (%eax), %ymm3, %ymm2
+# INTEL:      vsm4key4 ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0xda,0x10
+
+# ATT:        vsm4key4  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vsm4key4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vsm4key4  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vsm4key4 ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x66,0xda,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vsm4key4  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vsm4key4 ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x66,0xda,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vsm4key4  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vsm4key4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xda,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vsm4key4  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vsm4key4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0xda,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vsm4key4  (%eax), %xmm3, %xmm2
+# INTEL:      vsm4key4 xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0xda,0x10
+
+# ATT:        vsm4key4  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vsm4key4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vsm4key4  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vsm4key4 xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x62,0xda,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vsm4key4  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vsm4key4 xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x62,0xda,0x92,0x00,0xf8,0xff,0xff
+
+# ATT:        vsm4rnds4 %ymm4, %ymm3, %ymm2
+# INTEL:      vsm4rnds4 ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0xda,0xd4
+
+# ATT:        vsm4rnds4 %xmm4, %xmm3, %xmm2
+# INTEL:      vsm4rnds4 xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0xda,0xd4
+
+# ATT:        vsm4rnds4  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vsm4rnds4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0xda,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vsm4rnds4  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vsm4rnds4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x67,0xda,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vsm4rnds4  (%eax), %ymm3, %ymm2
+# INTEL:      vsm4rnds4 ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x67,0xda,0x10
+
+# ATT:        vsm4rnds4  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vsm4rnds4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vsm4rnds4  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vsm4rnds4 ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x67,0xda,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vsm4rnds4  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vsm4rnds4 ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x67,0xda,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vsm4rnds4  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vsm4rnds4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0xda,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vsm4rnds4  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vsm4rnds4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x63,0xda,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vsm4rnds4  (%eax), %xmm3, %xmm2
+# INTEL:      vsm4rnds4 xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x63,0xda,0x10
+
+# ATT:        vsm4rnds4  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vsm4rnds4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vsm4rnds4  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vsm4rnds4 xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x63,0xda,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vsm4rnds4  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vsm4rnds4 xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x63,0xda,0x92,0x00,0xf8,0xff,0xff
diff --git a/llvm/test/MC/Disassembler/X86/sm4-64.txt b/llvm/test/MC/Disassembler/X86/sm4-64.txt
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/sm4-64.txt
@@ -0,0 +1,115 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   vsm4key4 %ymm4, %ymm13, %ymm12
+# INTEL: vsm4key4 ymm12, ymm13, ymm4
+0xc4,0x62,0x16,0xda,0xe4
+
+# ATT:   vsm4key4 %xmm4, %xmm13, %xmm12
+# INTEL: vsm4key4 xmm12, xmm13, xmm4
+0xc4,0x62,0x12,0xda,0xe4
+
+# ATT:   vsm4key4  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vsm4key4 ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x16,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vsm4key4  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vsm4key4 ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x16,0xda,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vsm4key4  (%rip), %ymm13, %ymm12
+# INTEL: vsm4key4 ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x16,0xda,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vsm4key4  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vsm4key4 ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x16,0xda,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vsm4key4  4064(%rcx), %ymm13, %ymm12
+# INTEL: vsm4key4 ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x16,0xda,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vsm4key4  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vsm4key4 ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x16,0xda,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vsm4key4  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vsm4key4 xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x12,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vsm4key4  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vsm4key4 xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x12,0xda,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vsm4key4  (%rip), %xmm13, %xmm12
+# INTEL: vsm4key4 xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x12,0xda,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vsm4key4  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vsm4key4 xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x12,0xda,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vsm4key4  2032(%rcx), %xmm13, %xmm12
+# INTEL: vsm4key4 xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x12,0xda,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vsm4key4  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vsm4key4 xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x12,0xda,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT:   vsm4rnds4 %ymm4, %ymm13, %ymm12
+# INTEL: vsm4rnds4 ymm12, ymm13, ymm4
+0xc4,0x62,0x17,0xda,0xe4
+
+# ATT:   vsm4rnds4 %xmm4, %xmm13, %xmm12
+# INTEL: vsm4rnds4 xmm12, xmm13, xmm4
+0xc4,0x62,0x13,0xda,0xe4
+
+# ATT:   vsm4rnds4  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vsm4rnds4 ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x17,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vsm4rnds4  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vsm4rnds4 ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x17,0xda,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vsm4rnds4  (%rip), %ymm13, %ymm12
+# INTEL: vsm4rnds4 ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x17,0xda,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vsm4rnds4  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vsm4rnds4 ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x17,0xda,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vsm4rnds4  4064(%rcx), %ymm13, %ymm12
+# INTEL: vsm4rnds4 ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x17,0xda,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vsm4rnds4  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vsm4rnds4 ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x17,0xda,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vsm4rnds4  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vsm4rnds4 xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x13,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vsm4rnds4  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vsm4rnds4 xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x13,0xda,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vsm4rnds4  (%rip), %xmm13, %xmm12
+# INTEL: vsm4rnds4 xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x13,0xda,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vsm4rnds4  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vsm4rnds4 xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x13,0xda,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vsm4rnds4  2032(%rcx), %xmm13, %xmm12
+# INTEL: vsm4rnds4 xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x13,0xda,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vsm4rnds4  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vsm4rnds4 xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x13,0xda,0xa2,0x00,0xf8,0xff,0xff
+
diff --git a/llvm/test/MC/X86/sm4-32-att.s b/llvm/test/MC/X86/sm4-32-att.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/sm4-32-att.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple i686-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK:      vsm4key4 %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0xd4]
+               vsm4key4 %ymm4, %ymm3, %ymm2
+
+// CHECK:      vsm4key4 %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0xd4]
+               vsm4key4 %xmm4, %xmm3, %xmm2
+
+// CHECK:      vsm4key4  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4key4  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vsm4key4  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4key4  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vsm4key4  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x10]
+               vsm4key4  (%eax), %ymm3, %ymm2
+
+// CHECK:      vsm4key4  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vsm4key4  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vsm4key4  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x91,0xe0,0x0f,0x00,0x00]
+               vsm4key4  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vsm4key4  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x92,0x00,0xf0,0xff,0xff]
+               vsm4key4  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vsm4key4  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4key4  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vsm4key4  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4key4  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vsm4key4  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x10]
+               vsm4key4  (%eax), %xmm3, %xmm2
+
+// CHECK:      vsm4key4  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vsm4key4  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vsm4key4  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x91,0xf0,0x07,0x00,0x00]
+               vsm4key4  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vsm4key4  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x92,0x00,0xf8,0xff,0xff]
+               vsm4key4  -2048(%edx), %xmm3, %xmm2
+
+// CHECK:      vsm4rnds4 %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0xd4]
+               vsm4rnds4 %ymm4, %ymm3, %ymm2
+
+// CHECK:      vsm4rnds4 %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0xd4]
+               vsm4rnds4 %xmm4, %xmm3, %xmm2
+
+// CHECK:      vsm4rnds4  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4rnds4  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vsm4rnds4  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4rnds4  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vsm4rnds4  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x10]
+               vsm4rnds4  (%eax), %ymm3, %ymm2
+
+// CHECK:      vsm4rnds4  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vsm4rnds4  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vsm4rnds4  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x91,0xe0,0x0f,0x00,0x00]
+               vsm4rnds4  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vsm4rnds4  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x92,0x00,0xf0,0xff,0xff]
+               vsm4rnds4  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vsm4rnds4  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4rnds4  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vsm4rnds4  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4rnds4  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vsm4rnds4  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x10]
+               vsm4rnds4  (%eax), %xmm3, %xmm2
+
+// CHECK:      vsm4rnds4  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vsm4rnds4  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vsm4rnds4  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x91,0xf0,0x07,0x00,0x00]
+               vsm4rnds4  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vsm4rnds4  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x92,0x00,0xf8,0xff,0xff]
+               vsm4rnds4  -2048(%edx), %xmm3, %xmm2
+
diff --git a/llvm/test/MC/X86/sm4-32-intel.s b/llvm/test/MC/X86/sm4-32-intel.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/sm4-32-intel.s
@@ -0,0 +1,113 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK:      vsm4key4 ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0xd4]
+               vsm4key4 ymm2, ymm3, ymm4
+
+// CHECK:      vsm4key4 xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0xd4]
+               vsm4key4 xmm2, xmm3, xmm4
+
+// CHECK:      vsm4key4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4key4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vsm4key4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4key4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vsm4key4 ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x10]
+               vsm4key4 ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vsm4key4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vsm4key4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vsm4key4 ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x91,0xe0,0x0f,0x00,0x00]
+               vsm4key4 ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vsm4key4 ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xda,0x92,0x00,0xf0,0xff,0xff]
+               vsm4key4 ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vsm4key4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4key4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vsm4key4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4key4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vsm4key4 xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x10]
+               vsm4key4 xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vsm4key4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vsm4key4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vsm4key4 xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x91,0xf0,0x07,0x00,0x00]
+               vsm4key4 xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vsm4key4 xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xda,0x92,0x00,0xf8,0xff,0xff]
+               vsm4key4 xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK:      vsm4rnds4 ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0xd4]
+               vsm4rnds4 ymm2, ymm3, ymm4
+
+// CHECK:      vsm4rnds4 xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0xd4]
+               vsm4rnds4 xmm2, xmm3, xmm4
+
+// CHECK:      vsm4rnds4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4rnds4 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vsm4rnds4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4rnds4 ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vsm4rnds4 ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x10]
+               vsm4rnds4 ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vsm4rnds4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vsm4rnds4 ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vsm4rnds4 ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x91,0xe0,0x0f,0x00,0x00]
+               vsm4rnds4 ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vsm4rnds4 ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x67,0xda,0x92,0x00,0xf0,0xff,0xff]
+               vsm4rnds4 ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vsm4rnds4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vsm4rnds4 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vsm4rnds4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x94,0x87,0x23,0x01,0x00,0x00]
+               vsm4rnds4 xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vsm4rnds4 xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x10]
+               vsm4rnds4 xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vsm4rnds4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vsm4rnds4 xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vsm4rnds4 xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x91,0xf0,0x07,0x00,0x00]
+               vsm4rnds4 xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vsm4rnds4 xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x63,0xda,0x92,0x00,0xf8,0xff,0xff]
+               vsm4rnds4 xmm2, xmm3, xmmword ptr [edx - 2048]
diff --git a/llvm/test/MC/X86/sm4-64-att.s b/llvm/test/MC/X86/sm4-64-att.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/sm4-64-att.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+
+// CHECK: vsm4key4 %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0xe4]
+          vsm4key4 %ymm4, %ymm13, %ymm12
+
+// CHECK: vsm4key4 %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0xe4]
+          vsm4key4 %xmm4, %xmm13, %xmm12
+
+// CHECK: vsm4key4  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4key4  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vsm4key4  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4key4  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vsm4key4  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4key4  (%rip), %ymm13, %ymm12
+
+// CHECK: vsm4key4  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vsm4key4  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vsm4key4  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0xa1,0xe0,0x0f,0x00,0x00]
+          vsm4key4  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vsm4key4  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0xa2,0x00,0xf0,0xff,0xff]
+          vsm4key4  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vsm4key4  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4key4  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vsm4key4  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4key4  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vsm4key4  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4key4  (%rip), %xmm13, %xmm12
+
+// CHECK: vsm4key4  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vsm4key4  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vsm4key4  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0xa1,0xf0,0x07,0x00,0x00]
+          vsm4key4  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vsm4key4  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0xa2,0x00,0xf8,0xff,0xff]
+          vsm4key4  -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vsm4rnds4 %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0xe4]
+          vsm4rnds4 %ymm4, %ymm13, %ymm12
+
+// CHECK: vsm4rnds4 %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0xe4]
+          vsm4rnds4 %xmm4, %xmm13, %xmm12
+
+// CHECK: vsm4rnds4  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x17,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4rnds4  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vsm4rnds4  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4rnds4  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vsm4rnds4  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4rnds4  (%rip), %ymm13, %ymm12
+
+// CHECK: vsm4rnds4  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vsm4rnds4  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vsm4rnds4  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0xa1,0xe0,0x0f,0x00,0x00]
+          vsm4rnds4  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vsm4rnds4  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0xa2,0x00,0xf0,0xff,0xff]
+          vsm4rnds4  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vsm4rnds4  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x13,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4rnds4  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vsm4rnds4  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4rnds4  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vsm4rnds4  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4rnds4  (%rip), %xmm13, %xmm12
+
+// CHECK: vsm4rnds4  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vsm4rnds4  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vsm4rnds4  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0xa1,0xf0,0x07,0x00,0x00]
+          vsm4rnds4  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vsm4rnds4  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0xa2,0x00,0xf8,0xff,0xff]
+          vsm4rnds4  -2048(%rdx), %xmm13, %xmm12
+
diff --git a/llvm/test/MC/X86/sm4-64-intel.s b/llvm/test/MC/X86/sm4-64-intel.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/sm4-64-intel.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vsm4key4 ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0xe4]
+          vsm4key4 ymm12, ymm13, ymm4
+
+// CHECK: vsm4key4 xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0xe4]
+          vsm4key4 xmm12, xmm13, xmm4
+
+// CHECK: vsm4key4 ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4key4 ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vsm4key4 ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4key4 ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vsm4key4 ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4key4 ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vsm4key4 ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vsm4key4 ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vsm4key4 ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0xa1,0xe0,0x0f,0x00,0x00]
+          vsm4key4 ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vsm4key4 ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x16,0xda,0xa2,0x00,0xf0,0xff,0xff]
+          vsm4key4 ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vsm4key4 xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4key4 xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vsm4key4 xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4key4 xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vsm4key4 xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4key4 xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vsm4key4 xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vsm4key4 xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vsm4key4 xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0xa1,0xf0,0x07,0x00,0x00]
+          vsm4key4 xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vsm4key4 xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x12,0xda,0xa2,0x00,0xf8,0xff,0xff]
+          vsm4key4 xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vsm4rnds4 ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0xe4]
+          vsm4rnds4 ymm12, ymm13, ymm4
+
+// CHECK: vsm4rnds4 xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0xe4]
+          vsm4rnds4 xmm12, xmm13, xmm4
+
+// CHECK: vsm4rnds4 ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x17,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4rnds4 ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vsm4rnds4 ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x17,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4rnds4 ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vsm4rnds4 ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4rnds4 ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vsm4rnds4 ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vsm4rnds4 ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vsm4rnds4 ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0xa1,0xe0,0x0f,0x00,0x00]
+          vsm4rnds4 ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vsm4rnds4 ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x17,0xda,0xa2,0x00,0xf0,0xff,0xff]
+          vsm4rnds4 ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vsm4rnds4 xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x13,0xda,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vsm4rnds4 xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vsm4rnds4 xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x13,0xda,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vsm4rnds4 xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vsm4rnds4 xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0x25,0x00,0x00,0x00,0x00]
+          vsm4rnds4 xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vsm4rnds4 xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vsm4rnds4 xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vsm4rnds4 xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0xa1,0xf0,0x07,0x00,0x00]
+          vsm4rnds4 xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vsm4rnds4 xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x13,0xda,0xa2,0x00,0xf8,0xff,0xff]
+          vsm4rnds4 xmm12, xmm13, xmmword ptr [rdx - 2048]
+