Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -6421,3 +6421,18 @@
   def int_x86_clzero : GCCBuiltin<"__builtin_ia32_clzero">,
       Intrinsic<[], [llvm_ptr_ty], []>;
 }
+
+//===----------------------------------------------------------------------===//
+// Wait and pause enhancements
+let TargetPrefix = "x86" in {
+  def int_x86_umonitor : GCCBuiltin<"__builtin_ia32_umonitor">,
+      Intrinsic<[], [llvm_ptr_ty], []>;
+  def int_x86_umwait32 :
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_x86_umwait64 :
+      Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_x86_tpause32 :
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_x86_tpause64 :
+      Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>;
+}
Index: lib/Support/Host.cpp
===================================================================
--- lib/Support/Host.cpp
+++ lib/Support/Host.cpp
@@ -1245,6 +1245,7 @@
   Features["prefetchwt1"] = HasLeaf7 && ((ECX >> 0) & 1);
   Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save;
   Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
+  Features["waitpkg"] = HasLeaf7 && ((ECX >> 5) & 1);
   Features["avx512vbmi2"] = HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save;
   Features["shstk"] = HasLeaf7 && ((ECX >> 7) & 1);
   Features["gfni"] = HasLeaf7 && ((ECX >> 8) & 1);
Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -251,6 +251,8 @@
                                     "Cache Line Write Back">;
 def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
                                     "Support RDPID instructions">;
+def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
+                                      "Wait and pause enhancements">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
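
Note for reviewers (illustration, not part of the diff): umwait and tpause consume a 64-bit TSC deadline, but the intrinsics above accept it pre-split into two i32 operands -- the upper half as the second argument and the lower half as the third -- which the instruction patterns below route through EAX and EDX. A minimal IR sketch of the call site a front end would emit; the function name is made up, and the CodeGen tests further down exercise exactly this shape:

declare void @llvm.x86.tpause64(i64, i32, i32)

define void @pause_until(i64 %control, i64 %deadline) {
entry:
  ; Split the 64-bit deadline into the two i32 halves the intrinsic expects.
  %hi64 = lshr i64 %deadline, 32
  %hi = trunc i64 %hi64 to i32
  %lo = trunc i64 %deadline to i32
  call void @llvm.x86.tpause64(i64 %control, i32 %hi, i32 %lo)
  ret void
}
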
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -889,6 +889,7 @@
 def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
 def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
 def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
+def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
 def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
                    AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
@@ -2671,6 +2672,39 @@
                 Requires<[ In64BitMode ]>;
 
 //===----------------------------------------------------------------------===//
+// WAITPKG Instructions
+//
+let SchedRW = [WriteSystem] in {
+  def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src),
+                     "umonitor\t$src",
+                     [(int_x86_umonitor GR32:$src)]>, XS,
+                   Requires<[HasWAITPKG, Not64BitMode]>;
+  def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src),
+                     "umonitor\t$src",
+                     [(int_x86_umonitor GR64:$src)]>, XS,
+                   Requires<[HasWAITPKG, In64BitMode]>;
+
+  let Uses = [ EAX, EDX ] in {
+    def UMWAIT32 : I<0xAE, MRM6r,
+                     (outs), (ins GR32:$src),
+                     "umwait\t$src", [(int_x86_umwait32 GR32:$src, EAX, EDX)]>,
+                   XD, Requires<[HasWAITPKG, Not64BitMode]>;
+    def UMWAIT64 : I<0xAE, MRM6r,
+                     (outs), (ins GR64:$src),
+                     "umwait\t$src", [(int_x86_umwait64 GR64:$src, EAX, EDX)]>,
+                   XD, Requires<[HasWAITPKG, In64BitMode]>;
+    def TPAUSE32 : I<0xAE, MRM6r,
+                     (outs), (ins GR32:$src),
+                     "tpause\t$src", [(int_x86_tpause32 GR32:$src, EAX, EDX)]>,
+                   PD, Requires<[HasWAITPKG, Not64BitMode]>;
+    def TPAUSE64 : I<0xAE, MRM6r,
+                     (outs), (ins GR64:$src),
+                     "tpause\t$src", [(int_x86_tpause64 GR64:$src, EAX, EDX)]>,
+                   PD, Requires<[HasWAITPKG, In64BitMode]>;
+  }
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
 // CLZERO Instruction
 //
 let SchedRW = [WriteSystem] in {
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -362,6 +362,9 @@
   /// Processor support RDPID instruction
   bool HasRDPID;
 
+  /// Processor supports WaitPKG instructions
+  bool HasWAITPKG;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpoline;
@@ -621,6 +624,7 @@
   bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
   bool hasCLWB() const { return HasCLWB; }
   bool hasRDPID() const { return HasRDPID; }
+  bool hasWAITPKG() const { return HasWAITPKG; }
   bool useRetpoline() const { return UseRetpoline; }
   bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -325,6 +325,7 @@
   HasCLFLUSHOPT = false;
   HasCLWB = false;
   HasRDPID = false;
+  HasWAITPKG = false;
   UseRetpoline = false;
   UseRetpolineExternalThunk = false;
   IsPMULLDSlow = false;
Index: test/CodeGen/X86/waitpkg-intrinsics-32.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/waitpkg-intrinsics-32.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-linux -mattr=+waitpkg | FileCheck %s --check-prefix=X32
+
+define void @test_umonitor(i8* %address) {
+; X32-LABEL: test_umonitor:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    umonitor %eax
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+entry:
+  %__ADDRESS.addr.i = alloca i8*, align 8
+  %address.addr = alloca i8*, align 8
+  store i8* %address, i8** %address.addr, align 8
+  %0 = load i8*, i8** %address.addr, align 8
+  store i8* %0, i8** %__ADDRESS.addr.i, align 8
+  %1 = load i8*, i8** %__ADDRESS.addr.i, align 8
+  call void @llvm.x86.umonitor(i8* %1)
+  ret void
+}
+
+define void @test_umwait(i32 %control, i64 %counter) {
+; X32-LABEL: test_umwait:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    subl $20, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    umwait %ecx
+; X32-NEXT:    addl $20, %esp
+; X32-NEXT:    retl
entry:
+  %__CONTROL.addr.i = alloca i32, align 4
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i32, align 4
+  %counter.addr = alloca i64, align 8
+  store i32 %control, i32* %control.addr, align 4
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i32, i32* %control.addr, align 4
+  %1 = load i64, i64* %counter.addr, align 8
+  store i32 %0, i32* %__CONTROL.addr.i, align 4
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i32, i32* %__CONTROL.addr.i, align 4
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.umwait32(i32 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+define void @test_tpause(i32 %control, i64 %counter) {
+; X32-LABEL: test_tpause:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    subl $20, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    tpause %ecx
+; X32-NEXT:    addl $20, %esp
+; X32-NEXT:    retl
+entry:
+  %__CONTROL.addr.i = alloca i32, align 4
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i32, align 4
+  %counter.addr = alloca i64, align 8
+  store i32 %control, i32* %control.addr, align 4
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i32, i32* %control.addr, align 4
+  %1 = load i64, i64* %counter.addr, align 8
+  store i32 %0, i32* %__CONTROL.addr.i, align 4
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i32, i32* %__CONTROL.addr.i, align 4
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.tpause32(i32 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+declare void @llvm.x86.umonitor(i8*)
+declare void @llvm.x86.umwait32(i32, i32, i32)
+declare void @llvm.x86.tpause32(i32, i32, i32)
Index: test/CodeGen/X86/waitpkg-intrinsics-64.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/waitpkg-intrinsics-64.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+waitpkg | FileCheck %s --check-prefix=X64
+
+define void @test_umonitor(i8* %address) {
+; X64-LABEL: test_umonitor:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    umonitor %rdi
+; X64-NEXT:    retq
+entry:
+  %__ADDRESS.addr.i = alloca i8*, align 8
+  %address.addr = alloca i8*, align 8
+  store i8* %address, i8** %address.addr, align 8
+  %0 = load i8*, i8** %address.addr, align 8
+  store i8* %0, i8** %__ADDRESS.addr.i, align 8
+  %1 = load i8*, i8** %__ADDRESS.addr.i, align 8
+  call void @llvm.x86.umonitor(i8* %1)
+  ret void
+}
+
+define void @test_umwait(i64 %control, i64 %counter) {
+; X64-LABEL: test_umwait:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    umwait %rdi
+; X64-NEXT:    retq
+entry:
+  %__CONTROL.addr.i = alloca i64, align 8
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i64, align 8
+  %counter.addr = alloca i64, align 8
+  store i64 %control, i64* %control.addr, align 8
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i64, i64* %control.addr, align 8
+  %1 = load i64, i64* %counter.addr, align 8
+  store i64 %0, i64* %__CONTROL.addr.i, align 8
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i64, i64* %__CONTROL.addr.i, align 8
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.umwait64(i64 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+define void @test_tpause(i64 %control, i64 %counter) {
+; X64-LABEL: test_tpause:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    tpause %rdi
+; X64-NEXT:    retq
+entry:
+  %__CONTROL.addr.i = alloca i64, align 8
+  %__COUNTER.addr.i = alloca i64, align 8
+  %control.addr = alloca i64, align 8
+  %counter.addr = alloca i64, align 8
+  store i64 %control, i64* %control.addr, align 8
+  store i64 %counter, i64* %counter.addr, align 8
+  %0 = load i64, i64* %control.addr, align 8
+  %1 = load i64, i64* %counter.addr, align 8
+  store i64 %0, i64* %__CONTROL.addr.i, align 8
+  store i64 %1, i64* %__COUNTER.addr.i, align 8
+  %2 = load i64, i64* %__CONTROL.addr.i, align 8
+  %3 = load i64, i64* %__COUNTER.addr.i, align 8
+  %shr.i = lshr i64 %3, 32
+  %conv.i = trunc i64 %shr.i to i32
+  %4 = load i64, i64* %__COUNTER.addr.i, align 8
+  %conv1.i = trunc i64 %4 to i32
+  call void @llvm.x86.tpause64(i64 %2, i32 %conv.i, i32 %conv1.i)
+  ret void
+}
+
+declare void @llvm.x86.umonitor(i8*)
+declare void @llvm.x86.umwait64(i64, i32, i32)
+declare void @llvm.x86.tpause64(i64, i32, i32)
Index: test/MC/Disassembler/X86/x86-32.txt
===================================================================
--- test/MC/Disassembler/X86/x86-32.txt
+++ test/MC/Disassembler/X86/x86-32.txt
@@ -820,3 +820,12 @@
 
 # CHECK: ptwritel %eax
 0xf3 0x0f 0xae 0xe0
+
+# CHECK: umonitor %eax
+0xf3,0x0f,0xae,0xf0
+
+# CHECK: umwait %eax
+0xf2,0x0f,0xae,0xf0
+
+# CHECK: tpause %eax
+0x66,0x0f,0xae,0xf0
Index: test/MC/Disassembler/X86/x86-64.txt
===================================================================
--- test/MC/Disassembler/X86/x86-64.txt
+++ test/MC/Disassembler/X86/x86-64.txt
@@ -516,3 +516,12 @@
 
 # CHECK: ptwriteq %rax
 0xf3 0x48 0x0f 0xae 0xe0
+
+# CHECK: umonitor %r13
+0xf3,0x41,0x0f,0xae,0xf5
+
+# CHECK: umwait %r15
+0xf2,0x41,0x0f,0xae,0xf7
+
+# CHECK: tpause %r15
+0x66,0x41,0x0f,0xae,0xf7
Index: test/MC/X86/x86-32-coverage.s
===================================================================
--- test/MC/X86/x86-32-coverage.s
+++ test/MC/X86/x86-32-coverage.s
@@ -10741,3 +10741,14 @@
 
 // CHECK: encoding: [0xf0,0x01,0x37]
         lock add %esi, (%edi)
+// CHECK: umonitor %eax
+// CHECK: encoding: [0xf3,0x0f,0xae,0xf0]
+        umonitor %eax
+
+// CHECK: umwait %eax
+// CHECK: encoding: [0xf2,0x0f,0xae,0xf0]
+        umwait %eax
+
+// CHECK: tpause %eax
+// CHECK: encoding: [0x66,0x0f,0xae,0xf0]
+        tpause %eax
Index: test/MC/X86/x86-64.s
===================================================================
--- test/MC/X86/x86-64.s
+++ test/MC/X86/x86-64.s
@@ -1559,6 +1559,18 @@
 // CHECK: encoding: [0xf3,0x48,0x0f,0xae,0xe0]
         ptwriteq %rax
 
+// CHECK: umonitor %r13
+// CHECK: encoding: [0xf3,0x41,0x0f,0xae,0xf5]
+umonitor %r13
+
+// CHECK: umwait %r15
+// CHECK: encoding: [0xf2,0x41,0x0f,0xae,0xf7]
+umwait %r15
+
+// CHECK: tpause %r15
+// CHECK: encoding: [0x66,0x41,0x0f,0xae,0xf7]
+tpause %r15
+
 // __asm __volatile(
 //     "pushf        \n\t"
 //     "popf         \n\t"
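
Closing note for reviewers (illustration, not part of the diff): together, umonitor and umwait support a user-level monitor/wait protocol. Below is a minimal IR sketch of that idiom using only the intrinsic signatures added above: arm the monitor on the flag's cache line, re-check the flag to close the race with the writer, then wait with the split 64-bit TSC deadline. The wait can end because the monitored line was written, the deadline passed, or the implementation woke the thread early, so the flag is re-checked on every iteration. The function itself is made up for illustration.

declare void @llvm.x86.umonitor(i8*)
declare void @llvm.x86.umwait64(i64, i32, i32)

define void @wait_for_flag(i32* %flag, i64 %deadline) {
entry:
  %line = bitcast i32* %flag to i8*
  ; Pre-split the 64-bit TSC deadline into the two i32 halves the
  ; intrinsic expects (upper half second, lower half third).
  %hi64 = lshr i64 %deadline, 32
  %hi = trunc i64 %hi64 to i32
  %lo = trunc i64 %deadline to i32
  br label %loop

loop:
  ; Arm address monitoring for the flag's cache line, then re-check the
  ; flag before sleeping so a write between the check and the wait is
  ; still observed.
  call void @llvm.x86.umonitor(i8* %line)
  %v = load volatile i32, i32* %flag
  %set = icmp ne i32 %v, 0
  br i1 %set, label %done, label %wait

wait:
  ; Control value 0 selects the deeper C0.2 wait state.
  call void @llvm.x86.umwait64(i64 0, i32 %hi, i32 %lo)
  br label %loop

done:
  ret void
}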