Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -6421,3 +6421,18 @@
   def int_x86_clzero : GCCBuiltin<"__builtin_ia32_clzero">,
       Intrinsic<[], [llvm_ptr_ty], []>;
 }
+
+//===----------------------------------------------------------------------===//
+// Wait and pause enhancements
+let TargetPrefix = "x86" in {
+  def int_x86_umonitor : GCCBuiltin<"__builtin_ia32_umonitor">,
+      Intrinsic<[], [llvm_ptr_ty], []>;
+  def int_x86_umwait32 :
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_x86_umwait64 :
+      Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_x86_tpause32 :
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_x86_tpause64 :
+      Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>;
+}
Index: lib/Support/Host.cpp
===================================================================
--- lib/Support/Host.cpp
+++ lib/Support/Host.cpp
@@ -1245,6 +1245,7 @@
   Features["prefetchwt1"] = HasLeaf7 && ((ECX >> 0) & 1);
   Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save;
   Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
+  Features["waitpkg"] = HasLeaf7 && ((ECX >> 5) & 1);
   Features["avx512vbmi2"] = HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save;
   Features["shstk"] = HasLeaf7 && ((ECX >> 7) & 1);
   Features["gfni"] = HasLeaf7 && ((ECX >> 8) & 1);
Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -251,6 +251,8 @@
                                     "Cache Line Write Back">;
 def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
                                     "Support RDPID instructions">;
+def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
+                                      "Wait and pause enhancements">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
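
Note for reviewers (illustration, not part of the diff): umwait and tpause consume a 64-bit TSC deadline, but the intrinsics above accept it pre-split into two i32 operands -- the upper half as the second argument and the lower half as the third -- which the instruction patterns below route through EAX and EDX. A minimal IR sketch of the call site a front end would emit; the function name is made up, and the CodeGen tests further down exercise exactly this shape:

declare void @llvm.x86.tpause64(i64, i32, i32)

define void @pause_until(i64 %control, i64 %deadline) {
entry:
  ; Split the 64-bit deadline into the two i32 halves the intrinsic expects.
  %hi64 = lshr i64 %deadline, 32
  %hi = trunc i64 %hi64 to i32
  %lo = trunc i64 %deadline to i32
  call void @llvm.x86.tpause64(i64 %control, i32 %hi, i32 %lo)
  ret void
}
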
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -889,6 +889,7 @@
 def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
 def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
 def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
+def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
 def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
                    AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
@@ -2671,6 +2672,39 @@
                 Requires<[ In64BitMode ]>;
 
 //===----------------------------------------------------------------------===//
+// WAITPKG Instructions
+//
+let SchedRW = [WriteSystem] in {
+  def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src),
+                     "umonitor\t$src",
+                     [(int_x86_umonitor GR32:$src)]>, XS,
+                   Requires<[HasWAITPKG, Not64BitMode]>;
+  def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src),
+                     "umonitor\t$src",
+                     [(int_x86_umonitor GR64:$src)]>, XS,
+                   Requires<[HasWAITPKG, In64BitMode]>;
+
+  let Uses = [ EAX, EDX ] in {
+    def UMWAIT32 : I<0xAE, MRM6r,
+                     (outs), (ins GR32:$src),
+                     "umwait\t$src", [(int_x86_umwait32 GR32:$src, EAX, EDX)]>,
+                   XD, Requires<[HasWAITPKG, Not64BitMode]>;
+    def UMWAIT64 : I<0xAE, MRM6r,
+                     (outs), (ins GR64:$src),
+                     "umwait\t$src", [(int_x86_umwait64 GR64:$src, EAX, EDX)]>,
+                   XD, Requires<[HasWAITPKG, In64BitMode]>;
+    def TPAUSE32 : I<0xAE, MRM6r,
+                     (outs), (ins GR32:$src),
+                     "tpause\t$src", [(int_x86_tpause32 GR32:$src, EAX, EDX)]>,
+                   PD, Requires<[HasWAITPKG, Not64BitMode]>;
+    def TPAUSE64 : I<0xAE, MRM6r,
+                     (outs), (ins GR64:$src),
+                     "tpause\t$src", [(int_x86_tpause64 GR64:$src, EAX, EDX)]>,
+                   PD, Requires<[HasWAITPKG, In64BitMode]>;
+  }
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
 // CLZERO Instruction
 //
 let SchedRW = [WriteSystem] in {
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -362,6 +362,9 @@
   /// Processor support RDPID instruction
   bool HasRDPID;
 
+  /// Processor supports WaitPKG instructions
+  bool HasWAITPKG;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpoline;
@@ -621,6 +624,7 @@
   bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
   bool hasCLWB() const { return HasCLWB; }
   bool hasRDPID() const { return HasRDPID; }
+  bool hasWAITPKG() const { return HasWAITPKG; }
   bool useRetpoline() const { return UseRetpoline; }
   bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -325,6 +325,7 @@
   HasCLFLUSHOPT = false;
   HasCLWB = false;
   HasRDPID = false;
+  HasWAITPKG = false;
   UseRetpoline = false;
   UseRetpolineExternalThunk = false;
   IsPMULLDSlow = false;
Index: test/CodeGen/X86/waitpkg-intrinsics-32.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/waitpkg-intrinsics-32.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-linux -mattr=+waitpkg | FileCheck %s --check-prefix=X32
+
+define void @test_umonitor(i8* %address) {
+; X32-LABEL: test_umonitor:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    umonitor %eax
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+entry:
+  %__ADDRESS.addr.i = alloca i8*, align 8
+  %address.addr = alloca i8*, align 8
+  store i8* %address, i8** %address.addr, align 8
+  %0 = load i8*, i8** %address.addr, align 8
+  store i8* %0, i8** %__ADDRESS.addr.i, align 8
+  %1 = load i8*, i8** %__ADDRESS.addr.i, align 8
+  call void @llvm.x86.umonitor(i8* %1)
+  ret void
+}
+
+define void @test_umwait(i32 %control, i64 %counter) {
+; X32-LABEL: test_umwait:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    subl $20, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    umwait %ecx
+; X32-NEXT:    addl $20, %esp
+; X32-NEXT:    retl
entry:
+  %__CONTROL.addr.i = alloca i32, align 4
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i32, align 4
+  %counter.addr = alloca i64, align 8
+  store i32 %control, i32* %control.addr, align 4
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i32, i32* %control.addr, align 4
+  %1 = load i64, i64* %counter.addr, align 8
+  store i32 %0, i32* %__CONTROL.addr.i, align 4
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i32, i32* %__CONTROL.addr.i, align 4
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.umwait32(i32 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+define void @test_tpause(i32 %control, i64 %counter) {
+; X32-LABEL: test_tpause:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    subl $20, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    tpause %ecx
+; X32-NEXT:    addl $20, %esp
+; X32-NEXT:    retl
+entry:
+  %__CONTROL.addr.i = alloca i32, align 4
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i32, align 4
+  %counter.addr = alloca i64, align 8
+  store i32 %control, i32* %control.addr, align 4
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i32, i32* %control.addr, align 4
+  %1 = load i64, i64* %counter.addr, align 8
+  store i32 %0, i32* %__CONTROL.addr.i, align 4
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i32, i32* %__CONTROL.addr.i, align 4
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.tpause32(i32 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+declare void @llvm.x86.umonitor(i8*)
+declare void @llvm.x86.umwait32(i32, i32, i32)
+declare void @llvm.x86.tpause32(i32, i32, i32)
Index: test/CodeGen/X86/waitpkg-intrinsics-64.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/waitpkg-intrinsics-64.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+waitpkg | FileCheck %s --check-prefix=X64
+
+define void @test_umonitor(i8* %address) {
+; X64-LABEL: test_umonitor:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    umonitor %rdi
+; X64-NEXT:    retq
+entry:
+  %__ADDRESS.addr.i = alloca i8*, align 8
+  %address.addr = alloca i8*, align 8
+  store i8* %address, i8** %address.addr, align 8
+  %0 = load i8*, i8** %address.addr, align 8
+  store i8* %0, i8** %__ADDRESS.addr.i, align 8
+  %1 = load i8*, i8** %__ADDRESS.addr.i, align 8
+  call void @llvm.x86.umonitor(i8* %1)
+  ret void
+}
+
+define void @test_umwait(i64 %control, i64 %counter) {
+; X64-LABEL: test_umwait:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    umwait %rdi
+; X64-NEXT:    retq
+entry:
+  %__CONTROL.addr.i = alloca i64, align 8
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i64, align 8
+  %counter.addr = alloca i64, align 8
+  store i64 %control, i64* %control.addr, align 8
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i64, i64* %control.addr, align 8
+  %1 = load i64, i64* %counter.addr, align 8
+  store i64 %0, i64* %__CONTROL.addr.i, align 8
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i64, i64* %__CONTROL.addr.i, align 8
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.umwait64(i64 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+define void @test_tpause(i64 %control, i64 %counter) {
+; X64-LABEL: test_tpause:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    tpause %rdi
+; X64-NEXT:    retq
+entry:
+  %__CONTROL.addr.i = alloca i64, align 8
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i64, align 8
+  %counter.addr = alloca i64, align 8
+  store i64 %control, i64* %control.addr, align 8
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i64, i64* %control.addr, align 8
+  %1 = load i64, i64* %counter.addr, align 8
+  store i64 %0, i64* %__CONTROL.addr.i, align 8
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i64, i64* %__CONTROL.addr.i, align 8
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.tpause64(i64 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+declare void @llvm.x86.umonitor(i8*)
+declare void @llvm.x86.umwait64(i64, i32, i32)
+declare void @llvm.x86.tpause64(i64, i32, i32)
Index: test/MC/Disassembler/X86/x86-32.txt
===================================================================
--- test/MC/Disassembler/X86/x86-32.txt
+++ test/MC/Disassembler/X86/x86-32.txt
@@ -820,3 +820,12 @@
 
 # CHECK: ptwritel %eax
 0xf3 0x0f 0xae 0xe0
+
+# CHECK: umonitor %eax
+0xf3,0x0f,0xae,0xf0
+
+# CHECK: umwait %eax
+0xf2,0x0f,0xae,0xf0
+
+# CHECK: tpause %eax
+0x66,0x0f,0xae,0xf0
Index: test/MC/Disassembler/X86/x86-64.txt
===================================================================
--- test/MC/Disassembler/X86/x86-64.txt
+++ test/MC/Disassembler/X86/x86-64.txt
@@ -516,3 +516,12 @@
 
 # CHECK: ptwriteq %rax
 0xf3 0x48 0x0f 0xae 0xe0
+
+# CHECK: umonitor %r13
+0xf3,0x41,0x0f,0xae,0xf5
+
+# CHECK: umwait %r15
+0xf2,0x41,0x0f,0xae,0xf7
+
+# CHECK: tpause %r15
+0x66,0x41,0x0f,0xae,0xf7
Index: test/MC/X86/x86-32-coverage.s
===================================================================
--- test/MC/X86/x86-32-coverage.s
+++ test/MC/X86/x86-32-coverage.s
@@ -10741,3 +10741,14 @@
 
 // CHECK: encoding: [0xf0,0x01,0x37]
         lock add %esi, (%edi)
+// CHECK: umonitor %eax
+// CHECK: encoding: [0xf3,0x0f,0xae,0xf0]
+        umonitor %eax
+
+// CHECK: umwait %eax
+// CHECK: encoding: [0xf2,0x0f,0xae,0xf0]
+        umwait %eax
+
+// CHECK: tpause %eax
+// CHECK: encoding: [0x66,0x0f,0xae,0xf0]
+        tpause %eax
Index: test/MC/X86/x86-64.s
===================================================================
--- test/MC/X86/x86-64.s
+++ test/MC/X86/x86-64.s
@@ -1559,6 +1559,18 @@
 // CHECK: encoding: [0xf3,0x48,0x0f,0xae,0xe0]
         ptwriteq %rax
 
+// CHECK: umonitor %r13
+// CHECK: encoding: [0xf3,0x41,0x0f,0xae,0xf5]
+umonitor %r13
+
+// CHECK: umwait %r15
+// CHECK: encoding: [0xf2,0x41,0x0f,0xae,0xf7]
+umwait %r15
+
+// CHECK: tpause %r15
+// CHECK: encoding: [0x66,0x41,0x0f,0xae,0xf7]
+tpause %r15
+
 // __asm __volatile(
 //     "pushf        \n\t"
 //     "popf         \n\t"
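
Closing note for reviewers (illustration, not part of the diff): together, umonitor and umwait support a user-level monitor/wait protocol. Below is a minimal IR sketch of that idiom using only the intrinsic signatures added above: arm the monitor on the flag's cache line, re-check the flag to close the race with the writer, then wait with the split 64-bit TSC deadline. The wait can end because the monitored line was written, the deadline passed, or the implementation woke the thread early, so the flag is re-checked on every iteration. The function itself is made up for illustration.

declare void @llvm.x86.umonitor(i8*)
declare void @llvm.x86.umwait64(i64, i32, i32)

define void @wait_for_flag(i32* %flag, i64 %deadline) {
entry:
  %line = bitcast i32* %flag to i8*
  ; Pre-split the 64-bit TSC deadline into the two i32 halves the
  ; intrinsic expects (upper half second, lower half third).
  %hi64 = lshr i64 %deadline, 32
  %hi = trunc i64 %hi64 to i32
  %lo = trunc i64 %deadline to i32
  br label %loop

loop:
  ; Arm address monitoring for the flag's cache line, then re-check the
  ; flag before sleeping so a write between the check and the wait is
  ; still observed.
  call void @llvm.x86.umonitor(i8* %line)
  %v = load volatile i32, i32* %flag
  %set = icmp ne i32 %v, 0
  br i1 %set, label %done, label %wait

wait:
  ; Control value 0 selects the deeper C0.2 wait state.
  call void @llvm.x86.umwait64(i64 0, i32 %hi, i32 %lo)
  br label %loop

done:
  ret void
}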