Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -2590,6 +2590,10 @@ def mno_shstk : Flag<["-"], "mno-shstk">, Group; def mibt : Flag<["-"], "mibt">, Group; def mno_ibt : Flag<["-"], "mno-ibt">, Group; +def mretpoline : Flag<["-"], "mretpoline">, Group; +def mno_retpoline : Flag<["-"], "mno-retpoline">, Group; +def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group; +def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group; // These are legacy user-facing driver-level option spellings. They are always // aliases for options that are spelled using the more common Unix / GNU flag Index: clang/lib/Basic/Targets/X86.h =================================================================== --- clang/lib/Basic/Targets/X86.h +++ clang/lib/Basic/Targets/X86.h @@ -96,6 +96,8 @@ bool HasCLWB = false; bool HasMOVBE = false; bool HasPREFETCHWT1 = false; + bool HasRetpoline = false; + bool HasRetpolineExternalThunk = false; /// \brief Enumeration of all of the X86 CPUs supported by Clang. /// Index: clang/lib/Basic/Targets/X86.cpp =================================================================== --- clang/lib/Basic/Targets/X86.cpp +++ clang/lib/Basic/Targets/X86.cpp @@ -784,6 +784,10 @@ HasPREFETCHWT1 = true; } else if (Feature == "+clzero") { HasCLZERO = true; + } else if (Feature == "+retpoline") { + HasRetpoline = true; + } else if (Feature == "+retpoline-external-thunk") { + HasRetpolineExternalThunk = true; } X86SSEEnum Level = llvm::StringSwitch(Feature) @@ -1326,6 +1330,8 @@ .Case("prfchw", HasPRFCHW) .Case("rdrnd", HasRDRND) .Case("rdseed", HasRDSEED) + .Case("retpoline", HasRetpoline) + .Case("retpoline-external-thunk", HasRetpolineExternalThunk) .Case("rtm", HasRTM) .Case("sgx", HasSGX) .Case("sha", HasSHA) Index: clang/test/Driver/x86-target-features.c =================================================================== --- clang/test/Driver/x86-target-features.c +++ clang/test/Driver/x86-target-features.c @@ -125,3 +125,12 @@ // VBMI2: "-target-feature" "+avx512vbmi2" // NO-VBMI2: "-target-feature" "-avx512vbmi2" +// RUN: %clang -target i386-linux-gnu -mretpoline %s -### -o %t.o 2>&1 | FileCheck -check-prefix=RETPOLINE %s +// RUN: %clang -target i386-linux-gnu -mno-retpoline %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-RETPOLINE %s +// RETPOLINE: "-target-feature" "+retpoline" +// NO-RETPOLINE: "-target-feature" "-retpoline" + +// RUN: %clang -target i386-linux-gnu -mretpoline -mretpoline-external-thunk %s -### -o %t.o 2>&1 | FileCheck -check-prefix=RETPOLINE-EXTERNAL-THUNK %s +// RUN: %clang -target i386-linux-gnu -mretpoline -mno-retpoline-external-thunk %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-RETPOLINE-EXTERNAL-THUNK %s +// RETPOLINE-EXTERNAL-THUNK: "-target-feature" "+retpoline-external-thunk" +// NO-RETPOLINE-EXTERNAL-THUNK: "-target-feature" "-retpoline-external-thunk" Index: lld/ELF/Arch/X86.cpp =================================================================== --- lld/ELF/Arch/X86.cpp +++ lld/ELF/Arch/X86.cpp @@ -21,7 +21,7 @@ using namespace lld::elf; namespace { -class X86 final : public TargetInfo { +class X86 : public TargetInfo { public: X86(); RelExpr getRelExpr(RelType Type, const Symbol &S, @@ -399,7 +399,145 @@ memcpy(Loc - 2, Inst, sizeof(Inst)); } +namespace { +class RetpolinePic : public X86 { +public: + RetpolinePic(); + void writeGotPlt(uint8_t *Buf, 
const Symbol &S) const override; + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t Index, unsigned RelOff) const override; +}; + +class RetpolineNoPic : public X86 { +public: + RetpolineNoPic(); + void writeGotPlt(uint8_t *Buf, const Symbol &S) const override; + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t Index, unsigned RelOff) const override; +}; +} // namespace + +RetpolinePic::RetpolinePic() { + PltHeaderSize = 48; + PltEntrySize = 32; +} + +void RetpolinePic::writeGotPlt(uint8_t *Buf, const Symbol &S) const { + write32le(Buf, S.getPltVA() + 17); +} + +void RetpolinePic::writePltHeader(uint8_t *Buf) const { + const uint8_t Insn[] = { + 0xff, 0xb3, 0, 0, 0, 0, // 0: pushl GOTPLT+4(%ebx) + 0x50, // 6: pushl %eax + 0x8b, 0x83, 0, 0, 0, 0, // 7: mov GOTPLT+8(%ebx), %eax + 0xe8, 0x0e, 0x00, 0x00, 0x00, // d: call next + 0xf3, 0x90, // 12: loop: pause + 0x0f, 0xae, 0xe8, // 14: lfence + 0xeb, 0xf9, // 17: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19: int3; .align 16 + 0x89, 0x0c, 0x24, // 20: next: mov %ecx, (%esp) + 0x8b, 0x4c, 0x24, 0x04, // 23: mov 0x4(%esp), %ecx + 0x89, 0x44, 0x24, 0x04, // 27: mov %eax ,0x4(%esp) + 0x89, 0xc8, // 2b: mov %ecx, %eax + 0x59, // 2d: pop %ecx + 0xc3, // 2e: ret + }; + memcpy(Buf, Insn, sizeof(Insn)); + + uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize(); + uint32_t GotPlt = InX::GotPlt->getVA() - Ebx; + write32le(Buf + 2, GotPlt + 4); + write32le(Buf + 9, GotPlt + 8); +} + +void RetpolinePic::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x50, // pushl %eax + 0x8b, 0x83, 0, 0, 0, 0, // mov foo@GOT(%ebx), %eax + 0xe8, 0, 0, 0, 0, // call plt+0x20 + 0xe9, 0, 0, 0, 0, // jmp plt+0x12 + 0x68, 0, 0, 0, 0, // pushl $reloc_offset + 0xe9, 0, 0, 0, 0, // jmp plt+0 + }; + memcpy(Buf, Insn, sizeof(Insn)); + + uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize(); + write32le(Buf + 3, GotPltEntryAddr - Ebx); + write32le(Buf + 8, -Index * PltEntrySize - PltHeaderSize - 12 + 32); + write32le(Buf + 13, -Index * PltEntrySize - PltHeaderSize - 17 + 18); + write32le(Buf + 18, RelOff); + write32le(Buf + 23, -Index * PltEntrySize - PltHeaderSize - 27); +} + +RetpolineNoPic::RetpolineNoPic() { + PltHeaderSize = 64; + PltEntrySize = 32; +} + +void RetpolineNoPic::writeGotPlt(uint8_t *Buf, const Symbol &S) const { + write32le(Buf, S.getPltVA() + 16); +} + +void RetpolineNoPic::writePltHeader(uint8_t *Buf) const { + const uint8_t PltData[] = { + 0xff, 0x35, 0, 0, 0, 0, // 0: pushl GOTPLT+4 + 0x50, // 6: pushl %eax + 0xa1, 0, 0, 0, 0, // 7: mov GOTPLT+8, %eax + 0xe8, 0x0f, 0x00, 0x00, 0x00, // c: call next + 0xf3, 0x90, // 11: loop: pause + 0x0f, 0xae, 0xe8, // 13: lfence + 0xeb, 0xf9, // 16: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 18: int3 + 0xcc, 0xcc, 0xcc, // 1f: int3; .align 16 + 0x89, 0x0c, 0x24, // 20: next: mov %ecx, (%esp) + 0x8b, 0x4c, 0x24, 0x04, // 23: mov 0x4(%esp), %ecx + 0x89, 0x44, 0x24, 0x04, // 27: mov %eax ,0x4(%esp) + 0x89, 0xc8, // 2b: mov %ecx, %eax + 0x59, // 2d: pop %ecx + 0xc3, // 2e: ret + }; + memcpy(Buf, PltData, sizeof(PltData)); + + uint32_t GotPlt = InX::GotPlt->getVA(); + write32le(Buf + 2, GotPlt + 4); + write32le(Buf + 8, GotPlt + 8); +} + +void RetpolineNoPic::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t 
Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x50, // 0: pushl %eax + 0xa1, 0, 0, 0, 0, // 1: mov foo_in_GOT, %eax + 0xe8, 0, 0, 0, 0, // 6: call plt+0x20 + 0xe9, 0, 0, 0, 0, // b: jmp plt+0x11 + 0x68, 0, 0, 0, 0, // 10: pushl $reloc_offset + 0xe9, 0, 0, 0, 0, // 15: jmp plt+0 + }; + memcpy(Buf, Insn, sizeof(Insn)); + + write32le(Buf + 2, GotPltEntryAddr); + write32le(Buf + 7, -Index * PltEntrySize - PltHeaderSize - 11 + 32); + write32le(Buf + 12, -Index * PltEntrySize - PltHeaderSize - 16 + 17); + write32le(Buf + 17, RelOff); + write32le(Buf + 22, -Index * PltEntrySize - PltHeaderSize - 26); +} + TargetInfo *elf::getX86TargetInfo() { - static X86 Target; - return &Target; + if (Config->ZRetpolineplt) { + if (Config->Pic) { + static RetpolinePic T; + return &T; + } + static RetpolineNoPic T; + return &T; + } + + static X86 T; + return &T; } Index: lld/ELF/Arch/X86_64.cpp =================================================================== --- lld/ELF/Arch/X86_64.cpp +++ lld/ELF/Arch/X86_64.cpp @@ -23,7 +23,7 @@ using namespace lld::elf; namespace { -template class X86_64 final : public TargetInfo { +template class X86_64 : public TargetInfo { public: X86_64(); RelExpr getRelExpr(RelType Type, const Symbol &S, @@ -460,12 +460,125 @@ write32le(Loc - 1, Val + 1); } -TargetInfo *elf::getX32TargetInfo() { - static X86_64 Target; - return &Target; +namespace { +template class Retpoline : public X86_64 { +public: + Retpoline(); + void writeGotPlt(uint8_t *Buf, const Symbol &S) const override; + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t Index, unsigned RelOff) const override; +}; + +template class RetpolineZNow : public X86_64 { +public: + RetpolineZNow(); + void writeGotPlt(uint8_t *Buf, const Symbol &S) const override {} + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t Index, unsigned RelOff) const override; +}; +} // namespace + +template Retpoline::Retpoline() { + TargetInfo::PltHeaderSize = 48; + TargetInfo::PltEntrySize = 32; +} + +template +void Retpoline::writeGotPlt(uint8_t *Buf, const Symbol &S) const { + write32le(Buf, S.getPltVA() + 17); +} + +template void Retpoline::writePltHeader(uint8_t *Buf) const { + const uint8_t Insn[] = { + 0xff, 0x35, 0, 0, 0, 0, // 0: pushq GOTPLT+8(%rip) + 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 6: mov GOTPLT+16(%rip), %r11 + 0xe8, 0x0e, 0x00, 0x00, 0x00, // d: callq next + 0xf3, 0x90, // 12: loop: pause + 0x0f, 0xae, 0xe8, // 14: lfence + 0xeb, 0xf9, // 17: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19: int3; .align 16 + 0x4c, 0x89, 0x1c, 0x24, // 20: next: mov %r11, (%rsp) + 0xc3, // 24: ret + }; + memcpy(Buf, Insn, sizeof(Insn)); + + uint64_t GotPlt = InX::GotPlt->getVA(); + uint64_t Plt = InX::Plt->getVA(); + write32le(Buf + 2, GotPlt - Plt - 6 + 8); + write32le(Buf + 9, GotPlt - Plt - 13 + 16); +} + +template +void Retpoline::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 0: mov foo@GOTPLT(%rip), %r11 + 0xe8, 0, 0, 0, 0, // 7: callq plt+0x20 + 0xe9, 0, 0, 0, 0, // c: jmp plt+0x12 + 0x68, 0, 0, 0, 0, // 11: pushq + 0xe9, 0, 0, 0, 0, // 16: jmp plt+0 + }; + memcpy(Buf, Insn, sizeof(Insn)); + + uint64_t Off = TargetInfo::PltHeaderSize + TargetInfo::PltEntrySize * Index; + + write32le(Buf + 3, GotPltEntryAddr - 
PltEntryAddr - 7); + write32le(Buf + 8, -Off - 12 + 32); + write32le(Buf + 13, -Off - 17 + 18); + write32le(Buf + 18, Index); + write32le(Buf + 23, -Off - 27); +} + +template RetpolineZNow::RetpolineZNow() { + TargetInfo::PltHeaderSize = 32; + TargetInfo::PltEntrySize = 16; +} + +template +void RetpolineZNow::writePltHeader(uint8_t *Buf) const { + const uint8_t Insn[] = { + 0xe8, 0x0b, 0x00, 0x00, 0x00, // 0: call next + 0xf3, 0x90, // 5: loop: pause + 0x0f, 0xae, 0xe8, // 7: lfence + 0xeb, 0xf9, // a: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, // c: int3; .align 16 + 0x4c, 0x89, 0x1c, 0x24, // 10: next: mov %r11, (%rsp) + 0xc3, // 14: ret + }; + memcpy(Buf, Insn, sizeof(Insn)); } -TargetInfo *elf::getX86_64TargetInfo() { - static X86_64 Target; - return &Target; +template +void RetpolineZNow::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // mov foo@GOTPLT(%rip), %r11 + 0xe9, 0, 0, 0, 0, // jmp plt+0 + }; + memcpy(Buf, Insn, sizeof(Insn)); + + write32le(Buf + 3, GotPltEntryAddr - PltEntryAddr - 7); + write32le(Buf + 8, + -Index * TargetInfo::PltEntrySize - TargetInfo::PltHeaderSize - 12); } + +template TargetInfo *getTargetInfo() { + if (Config->ZRetpolineplt) { + if (Config->ZNow) { + static RetpolineZNow T; + return &T; + } + static Retpoline T; + return &T; + } + + static X86_64 T; + return &T; +} + +TargetInfo *elf::getX32TargetInfo() { return getTargetInfo(); } +TargetInfo *elf::getX86_64TargetInfo() { return getTargetInfo(); } Index: lld/ELF/Config.h =================================================================== --- lld/ELF/Config.h +++ lld/ELF/Config.h @@ -160,6 +160,7 @@ bool ZRelro; bool ZRodynamic; bool ZText; + bool ZRetpolineplt; bool ExitEarly; bool ZWxneeded; DiscardPolicy Discard; Index: lld/ELF/Driver.cpp =================================================================== --- lld/ELF/Driver.cpp +++ lld/ELF/Driver.cpp @@ -678,6 +678,7 @@ Config->ZNow = hasZOption(Args, "now"); Config->ZOrigin = hasZOption(Args, "origin"); Config->ZRelro = !hasZOption(Args, "norelro"); + Config->ZRetpolineplt = hasZOption(Args, "retpolineplt"); Config->ZRodynamic = hasZOption(Args, "rodynamic"); Config->ZStackSize = args::getZOptionValue(Args, OPT_z, "stack-size", 0); Config->ZText = !hasZOption(Args, "notext"); Index: lld/test/ELF/i386-retpoline-nopic.s =================================================================== --- /dev/null +++ lld/test/ELF/i386-retpoline-nopic.s @@ -0,0 +1,81 @@ +// REQUIRES: x86 +// RUN: llvm-mc -filetype=obj -triple=i386-unknown-linux %s -o %t1.o +// RUN: llvm-mc -filetype=obj -triple=i386-unknown-linux %p/Inputs/shared.s -o %t2.o +// RUN: ld.lld -shared %t2.o -o %t2.so + +// RUN: ld.lld %t1.o %t2.so -o %t.exe -z retpolineplt +// RUN: llvm-objdump -d -s %t.exe | FileCheck %s + +// CHECK: Disassembly of section .plt: +// CHECK-NEXT: .plt: +// CHECK-NEXT: 11010: ff 35 04 20 01 00 pushl 73732 +// CHECK-NEXT: 11016: 50 pushl %eax +// CHECK-NEXT: 11017: a1 08 20 01 00 movl 73736, %eax +// CHECK-NEXT: 1101c: e8 0f 00 00 00 calll 15 <.plt+0x20> +// CHECK-NEXT: 11021: f3 90 pause +// CHECK-NEXT: 11023: 0f ae e8 lfence +// CHECK-NEXT: 11026: eb f9 jmp -7 <.plt+0x11> +// CHECK-NEXT: 11028: cc int3 +// CHECK-NEXT: 11029: cc int3 +// CHECK-NEXT: 1102a: cc int3 +// CHECK-NEXT: 1102b: cc int3 +// CHECK-NEXT: 1102c: cc int3 +// CHECK-NEXT: 1102d: cc int3 +// CHECK-NEXT: 1102e: cc int3 +// CHECK-NEXT: 1102f: cc int3 +// CHECK-NEXT: 11030: 89 0c 24 movl 
%ecx, (%esp) +// CHECK-NEXT: 11033: 8b 4c 24 04 movl 4(%esp), %ecx +// CHECK-NEXT: 11037: 89 44 24 04 movl %eax, 4(%esp) +// CHECK-NEXT: 1103b: 89 c8 movl %ecx, %eax +// CHECK-NEXT: 1103d: 59 popl %ecx +// CHECK-NEXT: 1103e: c3 retl +// CHECK-NEXT: 1103f: cc int3 +// CHECK-NEXT: 11040: cc int3 +// CHECK-NEXT: 11041: cc int3 +// CHECK-NEXT: 11042: cc int3 +// CHECK-NEXT: 11043: cc int3 +// CHECK-NEXT: 11044: cc int3 +// CHECK-NEXT: 11045: cc int3 +// CHECK-NEXT: 11046: cc int3 +// CHECK-NEXT: 11047: cc int3 +// CHECK-NEXT: 11048: cc int3 +// CHECK-NEXT: 11049: cc int3 +// CHECK-NEXT: 1104a: cc int3 +// CHECK-NEXT: 1104b: cc int3 +// CHECK-NEXT: 1104c: cc int3 +// CHECK-NEXT: 1104d: cc int3 +// CHECK-NEXT: 1104e: cc int3 +// CHECK-NEXT: 1104f: cc int3 +// CHECK-NEXT: 11050: 50 pushl %eax +// CHECK-NEXT: 11051: a1 0c 20 01 00 movl 73740, %eax +// CHECK-NEXT: 11056: e8 d5 ff ff ff calll -43 <.plt+0x20> +// CHECK-NEXT: 1105b: e9 c1 ff ff ff jmp -63 <.plt+0x11> +// CHECK-NEXT: 11060: 68 00 00 00 00 pushl $0 +// CHECK-NEXT: 11065: e9 a6 ff ff ff jmp -90 <.plt> +// CHECK-NEXT: 1106a: cc int3 +// CHECK-NEXT: 1106b: cc int3 +// CHECK-NEXT: 1106c: cc int3 +// CHECK-NEXT: 1106d: cc int3 +// CHECK-NEXT: 1106e: cc int3 +// CHECK-NEXT: 1106f: cc int3 +// CHECK-NEXT: 11070: 50 pushl %eax +// CHECK-NEXT: 11071: a1 10 20 01 00 movl 73744, %eax +// CHECK-NEXT: 11076: e8 b5 ff ff ff calll -75 <.plt+0x20> +// CHECK-NEXT: 1107b: e9 a1 ff ff ff jmp -95 <.plt+0x11> +// CHECK-NEXT: 11080: 68 08 00 00 00 pushl $8 +// CHECK-NEXT: 11085: e9 86 ff ff ff jmp -122 <.plt> +// CHECK-NEXT: 1108a: cc int3 +// CHECK-NEXT: 1108b: cc int3 +// CHECK-NEXT: 1108c: cc int3 +// CHECK-NEXT: 1108d: cc int3 +// CHECK-NEXT: 1108e: cc int3 +// CHECK-NEXT: 1108f: cc int3 + +// CHECK: Contents of section .got.plt: +// CHECK-NEXT: 00300100 00000000 00000000 60100100 +// CHECK-NEXT: 80100100 + +.global _start +_start: + jmp bar@PLT + jmp zed@PLT Index: lld/test/ELF/i386-retpoline-pic.s =================================================================== --- /dev/null +++ lld/test/ELF/i386-retpoline-pic.s @@ -0,0 +1,62 @@ +// REQUIRES: x86 +// RUN: llvm-mc -filetype=obj -triple=i386-unknown-linux -position-independent %s -o %t1.o +// RUN: llvm-mc -filetype=obj -triple=i386-unknown-linux -position-independent %p/Inputs/shared.s -o %t2.o +// RUN: ld.lld -shared %t2.o -o %t2.so + +// RUN: ld.lld %t1.o %t2.so -o %t.exe -z retpolineplt -pie +// RUN: llvm-objdump -d -s %t.exe | FileCheck %s + +// CHECK: Disassembly of section .plt: +// CHECK-NEXT: .plt: +// CHECK-NEXT: 1010: ff b3 04 20 00 00 pushl 8196(%ebx) +// CHECK-NEXT: 1016: 50 pushl %eax +// CHECK-NEXT: 1017: 8b 83 08 20 00 00 movl 8200(%ebx), %eax +// CHECK-NEXT: 101d: e8 0e 00 00 00 calll 14 <.plt+0x20> +// CHECK-NEXT: 1022: f3 90 pause +// CHECK-NEXT: 1024: 0f ae e8 lfence +// CHECK-NEXT: 1027: eb f9 jmp -7 <.plt+0x12> +// CHECK-NEXT: 1029: cc int3 +// CHECK-NEXT: 102a: cc int3 +// CHECK-NEXT: 102b: cc int3 +// CHECK-NEXT: 102c: cc int3 +// CHECK-NEXT: 102d: cc int3 +// CHECK-NEXT: 102e: cc int3 +// CHECK-NEXT: 102f: cc int3 +// CHECK-NEXT: 1030: 89 0c 24 movl %ecx, (%esp) +// CHECK-NEXT: 1033: 8b 4c 24 04 movl 4(%esp), %ecx +// CHECK-NEXT: 1037: 89 44 24 04 movl %eax, 4(%esp) +// CHECK-NEXT: 103b: 89 c8 movl %ecx, %eax +// CHECK-NEXT: 103d: 59 popl %ecx +// CHECK-NEXT: 103e: c3 retl +// CHECK-NEXT: 103f: cc int3 +// CHECK-NEXT: 1040: 50 pushl %eax +// CHECK-NEXT: 1041: 8b 83 0c 20 00 00 movl 8204(%ebx), %eax +// CHECK-NEXT: 1047: e8 e4 ff ff ff calll -28 <.plt+0x20> +// CHECK-NEXT: 104c: 
e9 d1 ff ff ff jmp -47 <.plt+0x12> +// CHECK-NEXT: 1051: 68 00 00 00 00 pushl $0 +// CHECK-NEXT: 1056: e9 b5 ff ff ff jmp -75 <.plt> +// CHECK-NEXT: 105b: cc int3 +// CHECK-NEXT: 105c: cc int3 +// CHECK-NEXT: 105d: cc int3 +// CHECK-NEXT: 105e: cc int3 +// CHECK-NEXT: 105f: cc int3 +// CHECK-NEXT: 1060: 50 pushl %eax +// CHECK-NEXT: 1061: 8b 83 10 20 00 00 movl 8208(%ebx), %eax +// CHECK-NEXT: 1067: e8 c4 ff ff ff calll -60 <.plt+0x20> +// CHECK-NEXT: 106c: e9 b1 ff ff ff jmp -79 <.plt+0x12> +// CHECK-NEXT: 1071: 68 08 00 00 00 pushl $8 +// CHECK-NEXT: 1076: e9 95 ff ff ff jmp -107 <.plt> +// CHECK-NEXT: 107b: cc int3 +// CHECK-NEXT: 107c: cc int3 +// CHECK-NEXT: 107d: cc int3 +// CHECK-NEXT: 107e: cc int3 +// CHECK-NEXT: 107f: cc int3 + +// CHECK: Contents of section .got.plt: +// CHECK-NEXT: 2000 00300000 00000000 00000000 51100000 +// CHECK-NEXT: 2010 71100000 + +.global _start +_start: + jmp bar@PLT + jmp zed@PLT Index: lld/test/ELF/x86-64-retpoline-znow.s =================================================================== --- /dev/null +++ lld/test/ELF/x86-64-retpoline-znow.s @@ -0,0 +1,53 @@ +// REQUIRES: x86 +// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1.o +// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %p/Inputs/shared.s -o %t2.o +// RUN: ld.lld -shared %t2.o -o %t2.so + +// RUN: ld.lld -shared %t1.o %t2.so -o %t.exe -z retpolineplt -z now +// RUN: llvm-objdump -d -s %t.exe | FileCheck %s + +// CHECK: Disassembly of section .plt: +// CHECK-NEXT: .plt: +// CHECK-NEXT: 1010: e8 0b 00 00 00 callq 11 <.plt+0x10> +// CHECK-NEXT: 1015: f3 90 pause +// CHECK-NEXT: 1017: 0f ae e8 lfence +// CHECK-NEXT: 101a: eb f9 jmp -7 <.plt+0x5> +// CHECK-NEXT: 101c: cc int3 +// CHECK-NEXT: 101d: cc int3 +// CHECK-NEXT: 101e: cc int3 +// CHECK-NEXT: 101f: cc int3 +// CHECK-NEXT: 1020: 4c 89 1c 24 movq %r11, (%rsp) +// CHECK-NEXT: 1024: c3 retq +// CHECK-NEXT: 1025: cc int3 +// CHECK-NEXT: 1026: cc int3 +// CHECK-NEXT: 1027: cc int3 +// CHECK-NEXT: 1028: cc int3 +// CHECK-NEXT: 1029: cc int3 +// CHECK-NEXT: 102a: cc int3 +// CHECK-NEXT: 102b: cc int3 +// CHECK-NEXT: 102c: cc int3 +// CHECK-NEXT: 102d: cc int3 +// CHECK-NEXT: 102e: cc int3 +// CHECK-NEXT: 102f: cc int3 +// CHECK-NEXT: 1030: 4c 8b 1d c1 10 00 00 movq 4289(%rip), %r11 +// CHECK-NEXT: 1037: e9 d4 ff ff ff jmp -44 <.plt> +// CHECK-NEXT: 103c: cc int3 +// CHECK-NEXT: 103d: cc int3 +// CHECK-NEXT: 103e: cc int3 +// CHECK-NEXT: 103f: cc int3 +// CHECK-NEXT: 1040: 4c 8b 1d b9 10 00 00 movq 4281(%rip), %r11 +// CHECK-NEXT: 1047: e9 c4 ff ff ff jmp -60 <.plt> +// CHECK-NEXT: 104c: cc int3 +// CHECK-NEXT: 104d: cc int3 +// CHECK-NEXT: 104e: cc int3 +// CHECK-NEXT: 104f: cc int3 + +// CHECK: Contents of section .got.plt: +// CHECK-NEXT: 20e0 00200000 00000000 00000000 00000000 +// CHECK-NEXT: 20f0 00000000 00000000 00000000 00000000 +// CHECK-NEXT: 2100 00000000 00000000 + +.global _start +_start: + jmp bar@PLT + jmp zed@PLT Index: lld/test/ELF/x86-64-retpoline.s =================================================================== --- /dev/null +++ lld/test/ELF/x86-64-retpoline.s @@ -0,0 +1,66 @@ +// REQUIRES: x86 +// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1.o +// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %p/Inputs/shared.s -o %t2.o +// RUN: ld.lld -shared %t2.o -o %t2.so + +// RUN: ld.lld -shared %t1.o %t2.so -o %t.exe -z retpolineplt +// RUN: llvm-objdump -d -s %t.exe | FileCheck %s + +// CHECK: Disassembly of section .plt: +// CHECK-NEXT: .plt: +// CHECK-NEXT: 1010: ff 35 f2 
0f 00 00 pushq 4082(%rip) +// CHECK-NEXT: 1016: 4c 8b 1d f3 0f 00 00 movq 4083(%rip), %r11 +// CHECK-NEXT: 101d: e8 0e 00 00 00 callq 14 <.plt+0x20> +// CHECK-NEXT: 1022: f3 90 pause +// CHECK-NEXT: 1024: 0f ae e8 lfence +// CHECK-NEXT: 1027: eb f9 jmp -7 <.plt+0x12> +// CHECK-NEXT: 1029: cc int3 +// CHECK-NEXT: 102a: cc int3 +// CHECK-NEXT: 102b: cc int3 +// CHECK-NEXT: 102c: cc int3 +// CHECK-NEXT: 102d: cc int3 +// CHECK-NEXT: 102e: cc int3 +// CHECK-NEXT: 102f: cc int3 +// CHECK-NEXT: 1030: 4c 89 1c 24 movq %r11, (%rsp) +// CHECK-NEXT: 1034: c3 retq +// CHECK-NEXT: 1035: cc int3 +// CHECK-NEXT: 1036: cc int3 +// CHECK-NEXT: 1037: cc int3 +// CHECK-NEXT: 1038: cc int3 +// CHECK-NEXT: 1039: cc int3 +// CHECK-NEXT: 103a: cc int3 +// CHECK-NEXT: 103b: cc int3 +// CHECK-NEXT: 103c: cc int3 +// CHECK-NEXT: 103d: cc int3 +// CHECK-NEXT: 103e: cc int3 +// CHECK-NEXT: 103f: cc int3 +// CHECK-NEXT: 1040: 4c 8b 1d d1 0f 00 00 movq 4049(%rip), %r11 +// CHECK-NEXT: 1047: e8 e4 ff ff ff callq -28 <.plt+0x20> +// CHECK-NEXT: 104c: e9 d1 ff ff ff jmp -47 <.plt+0x12> +// CHECK-NEXT: 1051: 68 00 00 00 00 pushq $0 +// CHECK-NEXT: 1056: e9 b5 ff ff ff jmp -75 <.plt> +// CHECK-NEXT: 105b: cc int3 +// CHECK-NEXT: 105c: cc int3 +// CHECK-NEXT: 105d: cc int3 +// CHECK-NEXT: 105e: cc int3 +// CHECK-NEXT: 105f: cc int3 +// CHECK-NEXT: 1060: 4c 8b 1d b9 0f 00 00 movq 4025(%rip), %r11 +// CHECK-NEXT: 1067: e8 c4 ff ff ff callq -60 <.plt+0x20> +// CHECK-NEXT: 106c: e9 b1 ff ff ff jmp -79 <.plt+0x12> +// CHECK-NEXT: 1071: 68 01 00 00 00 pushq $1 +// CHECK-NEXT: 1076: e9 95 ff ff ff jmp -107 <.plt> +// CHECK-NEXT: 107b: cc int3 +// CHECK-NEXT: 107c: cc int3 +// CHECK-NEXT: 107d: cc int3 +// CHECK-NEXT: 107e: cc int3 +// CHECK-NEXT: 107f: cc int3 + +// CHECK: Contents of section .got.plt: +// CHECK-NEXT: 2000 00300000 00000000 00000000 00000000 +// CHECK-NEXT: 2010 00000000 00000000 51100000 00000000 +// CHECK-NEXT: 2020 71100000 00000000 + +.global _start +_start: + jmp bar@PLT + jmp zed@PLT Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -417,6 +417,9 @@ // This pass expands memcmp() to load/stores. FunctionPass *createExpandMemCmpPass(); + // This pass expands indirectbr instructions. + FunctionPass *createIndirectBrExpandPass(); + } // End llvm namespace #endif Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -800,7 +800,7 @@ } /// Return true if lowering to a jump table is allowed. - bool areJTsAllowed(const Function *Fn) const { + virtual bool areJTsAllowed(const Function *Fn) const { if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true") return false; Index: llvm/include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- llvm/include/llvm/CodeGen/TargetPassConfig.h +++ llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -409,6 +409,10 @@ /// immediately before machine code is emitted. virtual void addPreEmitPass() { } + /// This pass may be implemented by targets that want to run passes + /// that emit MI directly and bypass all other machine passes. + virtual void addEmitPass() {} + /// Utilities for targets to add passes to the pass manager. 
/// Index: llvm/include/llvm/CodeGen/TargetSubtargetInfo.h =================================================================== --- llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -174,6 +174,9 @@ /// \brief True if the subtarget should run the atomic expansion pass. virtual bool enableAtomicExpand() const; + /// True if the subtarget should run the indirectbr expansion pass. + virtual bool enableIndirectBrExpand() const; + /// \brief Override generic scheduling policy within a region. /// /// This is a convenient way for targets that don't provide any custom Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -161,6 +161,7 @@ void initializeIfConverterPass(PassRegistry&); void initializeImplicitNullChecksPass(PassRegistry&); void initializeIndVarSimplifyLegacyPassPass(PassRegistry&); +void initializeIndirectBrExpandPassPass(PassRegistry&); void initializeInductiveRangeCheckEliminationPass(PassRegistry&); void initializeInferAddressSpacesPass(PassRegistry&); void initializeInferFunctionAttrsLegacyPassPass(PassRegistry&); Index: llvm/lib/CodeGen/CMakeLists.txt =================================================================== --- llvm/lib/CodeGen/CMakeLists.txt +++ llvm/lib/CodeGen/CMakeLists.txt @@ -33,6 +33,7 @@ GlobalMerge.cpp IfConversion.cpp ImplicitNullChecks.cpp + IndirectBrExpandPass.cpp InlineSpiller.cpp InterferenceCache.cpp InterleavedAccessPass.cpp Index: llvm/lib/CodeGen/CodeGen.cpp =================================================================== --- llvm/lib/CodeGen/CodeGen.cpp +++ llvm/lib/CodeGen/CodeGen.cpp @@ -38,6 +38,7 @@ initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); initializeImplicitNullChecksPass(Registry); + initializeIndirectBrExpandPassPass(Registry); initializeInterleavedAccessPass(Registry); initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); Index: llvm/lib/CodeGen/IndirectBrExpandPass.cpp =================================================================== --- /dev/null +++ llvm/lib/CodeGen/IndirectBrExpandPass.cpp @@ -0,0 +1,221 @@ +//===- IndirectBrExpandPass.cpp - Expand indirectbr to switch -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Implements an expansion pass to turn `indirectbr` instructions in the IR +/// into `switch` instructions. This works by enumerating the basic blocks in +/// a dense range of integers, replacing each `blockaddr` constant with the +/// corresponding integer constant, and then building a switch that maps from +/// the integers to the actual blocks. All of the indirectbr instructions in the +/// function are redirected to this common switch. +/// +/// While this is generically useful if a target is unable to codegen +/// `indirectbr` natively, it is primarily useful when there is some desire to +/// get the builtin non-jump-table lowering of a switch even when the input +/// source contained an explicit indirect branch construct. +/// +/// Note that it doesn't make any sense to enable this pass unless a target also +/// disables jump-table lowering of switches. Doing that is likely to pessimize +/// the code. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "indirectbr-expand" + +namespace { + +class IndirectBrExpandPass : public FunctionPass { + const TargetLowering *TLI = nullptr; + +public: + static char ID; // Pass identification, replacement for typeid + + IndirectBrExpandPass() : FunctionPass(ID) { + initializeIndirectBrExpandPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; + +} // end anonymous namespace + +char IndirectBrExpandPass::ID = 0; + +INITIALIZE_PASS(IndirectBrExpandPass, DEBUG_TYPE, + "Expand indirectbr instructions", false, false) + +FunctionPass *llvm::createIndirectBrExpandPass() { + return new IndirectBrExpandPass(); +} + +bool IndirectBrExpandPass::runOnFunction(Function &F) { + auto &DL = F.getParent()->getDataLayout(); + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + auto &TM = TPC->getTM(); + auto &STI = *TM.getSubtargetImpl(F); + if (!STI.enableIndirectBrExpand()) + return false; + TLI = STI.getTargetLowering(); + + SmallVector IndirectBrs; + + // Set of all potential successors for indirectbr instructions. + SmallPtrSet IndirectBrSuccs; + + // Build a list of indirectbrs that we want to rewrite. + for (BasicBlock &BB : F) + if (auto *IBr = dyn_cast(BB.getTerminator())) { + // Handle the degenerate case of no successors by replacing the indirectbr + // with unreachable as there is no successor available. + if (IBr->getNumSuccessors() == 0) { + (void)new UnreachableInst(F.getContext(), IBr); + IBr->eraseFromParent(); + continue; + } + + IndirectBrs.push_back(IBr); + for (BasicBlock *SuccBB : IBr->successors()) + IndirectBrSuccs.insert(SuccBB); + } + + if (IndirectBrs.empty()) + return false; + + // If we need to replace any indirectbrs we need to establish integer + // constants that will correspond to each of the basic blocks in the function + // whose address escapes. We do that here and rewrite all the blockaddress + // constants to just be those integer constants cast to a pointer type. + SmallVector BBs; + + for (BasicBlock &BB : F) { + // Skip blocks that aren't successors to an indirectbr we're going to + // rewrite. + if (!IndirectBrSuccs.count(&BB)) + continue; + + auto IsBlockAddressUse = [&](const Use &U) { + return isa(U.getUser()); + }; + auto BlockAddressUseIt = llvm::find_if(BB.uses(), IsBlockAddressUse); + if (BlockAddressUseIt == BB.use_end()) + continue; + + assert(std::find_if(std::next(BlockAddressUseIt), BB.use_end(), + IsBlockAddressUse) == BB.use_end() && + "There should only ever be a single blockaddress use because it is " + "a constant and should be uniqued."); + + auto *BA = cast(BlockAddressUseIt->getUser()); + + // Skip if the constant was formed but ended up not being used (due to DCE + // or whatever). + if (!BA->isConstantUsed()) + continue; + + // Compute the index we want to use for this basic block. 
We can't use zero + // because null can be compared with block addresses. + int BBIndex = BBs.size() + 1; + BBs.push_back(&BB); + + auto *ITy = cast(DL.getIntPtrType(BA->getType())); + ConstantInt *BBIndexC = ConstantInt::get(ITy, BBIndex); + + // Now rewrite the blockaddress to an integer constant based on the index. + // FIXME: We could potentially preserve the uses as arguments to inline asm. + // This would allow some uses such as diagnostic information in crashes to + // have higher quality even when this transform is enabled, but would break + // users that round-trip blockaddresses through inline assembly and then + // back into an indirectbr. + BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(BBIndexC, BA->getType())); + } + + if (BBs.empty()) { + // There are no blocks whose address is taken, so any indirectbr instruction + // cannot get a valid input and we can replace all of them with unreachable. + for (auto *IBr : IndirectBrs) { + (void)new UnreachableInst(F.getContext(), IBr); + IBr->eraseFromParent(); + } + return true; + } + + BasicBlock *SwitchBB; + Value *SwitchValue; + + // Compute a common integer type across all the indirectbr instructions. + IntegerType *CommonITy = nullptr; + for (auto *IBr : IndirectBrs) { + auto *ITy = + cast(DL.getIntPtrType(IBr->getAddress()->getType())); + if (!CommonITy || ITy->getBitWidth() > CommonITy->getBitWidth()) + CommonITy = ITy; + } + + auto GetSwitchValue = [DL, CommonITy](IndirectBrInst *IBr) { + return CastInst::CreatePointerCast( + IBr->getAddress(), CommonITy, + Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr); + }; + + if (IndirectBrs.size() == 1) { + // If we only have one indirectbr, we can just directly replace it within + // its block. + SwitchBB = IndirectBrs[0]->getParent(); + SwitchValue = GetSwitchValue(IndirectBrs[0]); + IndirectBrs[0]->eraseFromParent(); + } else { + // Otherwise we need to create a new block to hold the switch across BBs, + // jump to that block instead of each indirectbr, and phi together the + // values for the switch. + SwitchBB = BasicBlock::Create(F.getContext(), "switch_bb", &F); + auto *SwitchPN = PHINode::Create(CommonITy, IndirectBrs.size(), + "switch_value_phi", SwitchBB); + SwitchValue = SwitchPN; + + // Now replace the indirectbr instructions with direct branches to the + // switch block and fill out the PHI operands. + for (auto *IBr : IndirectBrs) { + SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent()); + BranchInst::Create(SwitchBB, IBr); + IBr->eraseFromParent(); + } + } + + // Now build the switch in the block. The block will have no terminator + // already. + auto *SI = SwitchInst::Create(SwitchValue, BBs[0], BBs.size(), SwitchBB); + + // Add a case for each block. + for (int i : llvm::seq(1, BBs.size())) + SI->addCase(ConstantInt::get(CommonITy, i + 1), BBs[i]); + + return true; +} Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -904,6 +904,9 @@ if (EnableMachineOutliner) PM->add(createMachineOutlinerPass(EnableLinkOnceODROutlining)); + // Add passes that directly emit MI after all other MI passes. 
+ addEmitPass(); + AddingMachinePasses = false; } Index: llvm/lib/CodeGen/TargetSubtargetInfo.cpp =================================================================== --- llvm/lib/CodeGen/TargetSubtargetInfo.cpp +++ llvm/lib/CodeGen/TargetSubtargetInfo.cpp @@ -38,6 +38,10 @@ return true; } +bool TargetSubtargetInfo::enableIndirectBrExpand() const { + return false; +} + bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } Index: llvm/lib/Target/X86/CMakeLists.txt =================================================================== --- llvm/lib/Target/X86/CMakeLists.txt +++ llvm/lib/Target/X86/CMakeLists.txt @@ -50,6 +50,7 @@ X86PadShortFunction.cpp X86RegisterBankInfo.cpp X86RegisterInfo.cpp + X86RetpolineThunks.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp X86Subtarget.cpp Index: llvm/lib/Target/X86/X86.h =================================================================== --- llvm/lib/Target/X86/X86.h +++ llvm/lib/Target/X86/X86.h @@ -22,6 +22,7 @@ class FunctionPass; class ImmutablePass; class InstructionSelector; +class ModulePass; class PassRegistry; class X86RegisterBankInfo; class X86Subtarget; @@ -106,6 +107,9 @@ /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); +/// This pass creates the thunks for the retpoline feature. +ModulePass *createX86RetpolineThunksPass(); + InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); Index: llvm/lib/Target/X86/X86.td =================================================================== --- llvm/lib/Target/X86/X86.td +++ llvm/lib/Target/X86/X86.td @@ -332,6 +332,27 @@ : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast.">; +// Enable mitigation of some aspects of speculative execution related +// vulnerabilities by removing speculatable indirect branches. This disables +// jump-table formation, rewrites explicit `indirectbr` instructions into +// `switch` instructions, and uses a special construct called a "retpoline" to +// prevent speculation of the remaining indirect branches (indirect calls and +// tail calls). +def FeatureRetpoline + : SubtargetFeature<"retpoline", "UseRetpoline", "true", + "Remove speculation of indirect branches from the " + "generated code, either by avoiding them entirely or " + "lowering them with a speculation blocking construct.">; + +// Rely on external thunks for the emitted retpoline calls. This allows users +// to provide their own custom thunk definitions in highly specialized +// environments such as a kernel that does boot-time hot patching. +def FeatureRetpolineExternalThunk + : SubtargetFeature< + "retpoline-external-thunk", "UseRetpolineExternalThunk", "true", + "Enable retpoline, but with an externally provided thunk.", + [FeatureRetpoline]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// Index: llvm/lib/Target/X86/X86AsmPrinter.h =================================================================== --- llvm/lib/Target/X86/X86AsmPrinter.h +++ llvm/lib/Target/X86/X86AsmPrinter.h @@ -32,6 +32,7 @@ FaultMaps FM; std::unique_ptr CodeEmitter; bool EmitFPOData = false; + bool NeedsRetpoline = false; // This utility class tracks the length of a stackmap instruction's 'shadow'. 
// It is used by the X86AsmPrinter to ensure that the stackmap shadow Index: llvm/lib/Target/X86/X86FastISel.cpp =================================================================== --- llvm/lib/Target/X86/X86FastISel.cpp +++ llvm/lib/Target/X86/X86FastISel.cpp @@ -3172,6 +3172,10 @@ (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers"))) return false; + // Functions using retpoline should use SDISel for calls. + if (Subtarget->useRetpoline()) + return false; + // Handle only C, fastcc, and webkit_js calling conventions for now. switch (CC) { default: return false; Index: llvm/lib/Target/X86/X86FrameLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86FrameLowering.cpp +++ llvm/lib/Target/X86/X86FrameLowering.cpp @@ -741,6 +741,11 @@ bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + // FIXME: Add retpoline support and remove this. + if (Is64Bit && IsLargeCodeModel && STI.useRetpoline()) + report_fatal_error("Emitting stack probe calls on 64-bit with the large " + "code model and retpoline not yet implemented."); + unsigned CallOp; if (Is64Bit) CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; @@ -2344,6 +2349,10 @@ // This solution is not perfect, as it assumes that the .rodata section // is laid out within 2^31 bytes of each function body, but this seems // to be sufficient for JIT. + // FIXME: Add retpoline support and remove the error here.. + if (STI.useRetpoline()) + report_fatal_error("Emitting morestack calls on 64-bit with the large " + "code model and retpoline not yet implemented."); BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) .addReg(X86::RIP) .addImm(0) Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -629,11 +629,11 @@ SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && - // Only does this when target favors doesn't favor register indirect - // call. + // Only do this when the target can fold the load into the call or + // jmp. + !Subtarget->useRetpoline() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && - // Only does this if load can be folded into TC_RETURN. (Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -984,6 +984,9 @@ bool isVectorClearMaskLegal(const SmallVectorImpl &Mask, EVT VT) const override; + /// Returns true if lowering to a jump table is allowed. + bool areJTsAllowed(const Function *Fn) const override; + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. 
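The X86ISelDAGToDAG change above stops folding a load into an indirect call or tail call when retpoline is in use, since the folded form would still emit an indirect branch through memory. A rough sketch of the difference at a call site like (*p->fn)(); the registers are illustrative only and depend on register allocation:

  # Without retpoline, the load of the function pointer can fold into the call:
  callq *8(%rdi)

  # With +retpoline, the load stays separate and the call goes through the thunk:
  movq 8(%rdi), %r11
  callq __llvm_retpoline_r11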
@@ -1296,6 +1299,9 @@ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25735,6 +25735,15 @@ return isShuffleMaskLegal(Mask, VT); } +bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { + // If the subtarget is using retpolines, we need to not generate jump tables. + if (Subtarget.useRetpoline()) + return false; + + // Otherwise, fallback on the generic logic. + return TargetLowering::areJTsAllowed(Fn); +} + //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// @@ -27037,6 +27046,115 @@ return BB; } +static unsigned getOpcodeForRetpoline(unsigned RPOpc) { + switch (RPOpc) { + case X86::RETPOLINE_CALL32: + return X86::CALLpcrel32; + case X86::RETPOLINE_CALL64: + return X86::CALL64pcrel32; + case X86::RETPOLINE_TCRETURN32: + return X86::TCRETURNdi; + case X86::RETPOLINE_TCRETURN64: + return X86::TCRETURNdi64; + } + llvm_unreachable("not retpoline opcode"); +} + +static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, + unsigned Reg) { + switch (Reg) { + case 0: + assert(!Subtarget.is64Bit() && "R11 should always be available on x64"); + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_push" + : "__llvm_retpoline_push"; + case X86::EAX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_eax" + : "__llvm_retpoline_eax"; + case X86::ECX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_ecx" + : "__llvm_retpoline_ecx"; + case X86::EDX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_edx" + : "__llvm_retpoline_edx"; + case X86::R11: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_r11" + : "__llvm_retpoline_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const { + // Copy the virtual register into the R11 physical register and + // call the retpoline thunk. + DebugLoc DL = MI.getDebugLoc(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + unsigned CalleeVReg = MI.getOperand(0).getReg(); + unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); + + // Find an available scratch register to hold the callee. On 64-bit, we can + // just use R11, but we scan for uses anyway to ensure we don't generate + // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't + // already a register use operand to the call to hold the callee. If none + // are available, push the callee instead. This is less efficient, but is + // necessary for functions using 3 regparms. Such function calls are + // (currently) not eligible for tail call optimization, because there is no + // scratch register available to hold the address of the callee. + SmallVector AvailableRegs; + if (Subtarget.is64Bit()) + AvailableRegs.push_back(X86::R11); + else + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + + // Zero out any registers that are already used. 
+ for (const auto &MO : MI.operands()) { + if (MO.isReg() && MO.isUse()) + for (unsigned &Reg : AvailableRegs) + if (Reg == MO.getReg()) + Reg = 0; + } + + // Choose the first remaining non-zero available register. + unsigned AvailableReg = 0; + for (unsigned MaybeReg : AvailableRegs) { + if (MaybeReg) { + AvailableReg = MaybeReg; + break; + } + } + + const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); + + if (AvailableReg == 0) { + // No register available. Use PUSH. This must not be a tailcall, and this + // must not be x64. + if (Subtarget.is64Bit()) + report_fatal_error( + "Cannot make an indirect call on x86-64 using both retpoline and a " + "calling convention that preservers r11"); + if (Opc != X86::CALLpcrel32) + report_fatal_error("Cannot make an indirect tail call on x86 using " + "retpoline without a preserved register"); + BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + } else { + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); + } + return BB; +} + MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -27542,6 +27660,11 @@ case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); + case X86::RETPOLINE_CALL32: + case X86::RETPOLINE_CALL64: + case X86::RETPOLINE_TCRETURN32: + case X86::RETPOLINE_TCRETURN64: + return EmitLoweredRetpoline(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: Index: llvm/lib/Target/X86/X86InstrCompiler.td =================================================================== --- llvm/lib/Target/X86/X86InstrCompiler.td +++ llvm/lib/Target/X86/X86InstrCompiler.td @@ -1146,14 +1146,14 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + Requires<[Not64BitMode, NotUseRetpoline]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[Not64BitMode, IsNotPIC]>; + Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi tglobaladdr:$dst, imm:$off)>, @@ -1165,13 +1165,21 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. 
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode, UseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode, UseRetpoline]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, Index: llvm/lib/Target/X86/X86InstrControl.td =================================================================== --- llvm/lib/Target/X86/X86InstrControl.td +++ llvm/lib/Target/X86/X86InstrControl.td @@ -211,11 +211,12 @@ Sched<[WriteJumpLd]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, - OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + OpSize32, Requires<[Not64BitMode,NotUseRetpoline]>, + Sched<[WriteJump]>; def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, OpSize32, - Requires<[Not64BitMode,FavorMemIndirectCall]>, + Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>, Sched<[WriteJumpLd]>; let Predicates = [Not64BitMode] in { @@ -298,11 +299,12 @@ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)], IIC_CALL_RI>, - Requires<[In64BitMode]>; + Requires<[In64BitMode,NotUseRetpoline]>; def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, - Requires<[In64BitMode,FavorMemIndirectCall]>; + Requires<[In64BitMode,FavorMemIndirectCall, + NotUseRetpoline]>; def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; @@ -340,6 +342,27 @@ } } +let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, + Uses = [RSP, SSP], + usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def RETPOLINE_CALL32 : + PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>, + Requires<[Not64BitMode,UseRetpoline]>; + + def RETPOLINE_CALL64 : + PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>, + Requires<[In64BitMode,UseRetpoline]>; + + // Retpoline variant of indirect tail calls. + let isTerminator = 1, isReturn = 1, isBarrier = 1 in { + def RETPOLINE_TCRETURN64 : + PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>; + def RETPOLINE_TCRETURN32 : + PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>; + } +} + // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, Index: llvm/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.td +++ llvm/lib/Target/X86/X86InstrInfo.td @@ -925,6 +925,8 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; +def UseRetpoline : Predicate<"Subtarget->useRetpoline()">; +def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. 
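For reference, the RETPOLINE_CALL and RETPOLINE_TCRETURN pseudos above are expanded by EmitLoweredRetpoline (earlier in this patch) into a copy of the callee into a scratch register followed by a direct call or jump to the thunk symbol. A sketch of the 64-bit result, using %rax as a stand-in for whichever register holds the callee (the 32-bit path picks EAX/ECX/EDX or falls back to a push):

  # RETPOLINE_CALL64
  movq %rax, %r11
  callq __llvm_retpoline_r11

  # RETPOLINE_TCRETURN64 (indirect tail call)
  movq %rax, %r11
  jmp __llvm_retpoline_r11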
Index: llvm/lib/Target/X86/X86MCInstLower.cpp =================================================================== --- llvm/lib/Target/X86/X86MCInstLower.cpp +++ llvm/lib/Target/X86/X86MCInstLower.cpp @@ -874,6 +874,10 @@ // address is to far away. (TODO: support non-relative addressing) break; case MachineOperand::MO_Register: + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error("Lowering register statepoints with retpoline not " + "yet implemented."); CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); CallOpcode = X86::CALL64r; break; @@ -1028,6 +1032,10 @@ EmitAndCountInstruction( MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error( + "Lowering patchpoint with retpoline not yet implemented."); EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } Index: llvm/lib/Target/X86/X86RetpolineThunks.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/X86/X86RetpolineThunks.cpp @@ -0,0 +1,276 @@ +//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Pass that injects an MI thunk implementing a "retpoline". This is +/// a RET-implemented trampoline that is used to lower indirect calls in a way +/// that prevents speculation on some x86 processors and can be used to mitigate +/// security vulnerabilities due to targeted speculative execution and side +/// channels such as CVE-2017-5715. +/// +/// TODO(chandlerc): All of this code could use better comments and +/// documentation. 
+/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-retpoline-thunks" + +namespace { +class X86RetpolineThunks : public ModulePass { +public: + static char ID; + + X86RetpolineThunks() : ModulePass(ID) {} + + StringRef getPassName() const override { return "X86 Retpoline Thunks"; } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + } + +private: + MachineModuleInfo *MMI; + const TargetMachine *TM; + bool Is64Bit; + const X86Subtarget *STI; + const X86InstrInfo *TII; + + Function *createThunkFunction(Module &M, StringRef Name); + void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); + void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB); + void createThunk(Module &M, StringRef NameSuffix, + Optional Reg = None); +}; + +} // end anonymous namespace + +ModulePass *llvm::createX86RetpolineThunksPass() { + return new X86RetpolineThunks(); +} + +char X86RetpolineThunks::ID = 0; + +bool X86RetpolineThunks::runOnModule(Module &M) { + DEBUG(dbgs() << getPassName() << '\n'); + + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + MMI = &getAnalysis(); + TM = &TPC->getTM(); + Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; + + // Only add a thunk if we have at least one function that has the retpoline + // feature enabled in its subtarget. + // FIXME: Conditionalize on indirect calls so we don't emit a thunk when + // nothing will end up calling it. + // FIXME: It's a little silly to look at every function just to enumerate + // the subtargets, but eventually we'll want to look at them for indirect + // calls, so maybe this is OK. + if (!llvm::any_of(M, [&](const Function &F) { + // Save the subtarget we find for use in emitting the subsequent + // thunk. + STI = &TM->getSubtarget(F); + return STI->useRetpoline() && !STI->useRetpolineExternalThunk(); + })) + return false; + + // If we have a relevant subtarget, get the instr info as well. + TII = STI->getInstrInfo(); + + if (Is64Bit) { + // __llvm_retpoline_r11: + // callq .Lr11_call_target + // .Lr11_capture_spec: + // pause + // lfence + // jmp .Lr11_capture_spec + // .align 16 + // .Lr11_call_target: + // movq %r11, (%rsp) + // retq + + createThunk(M, "r11", X86::R11); + } else { + // For 32-bit targets we need to emit a collection of thunks for various + // possible scratch registers as well as a fallback that is used when + // there are no scratch registers and assumes the retpoline target has + // been pushed. + // __llvm_retpoline_eax: + // calll .Leax_call_target + // .Leax_capture_spec: + // pause + // jmp .Leax_capture_spec + // .align 16 + // .Leax_call_target: + // movl %eax, (%esp) # Clobber return addr + // retl + // + // __llvm_retpoline_ecx: + // ... # Same setup + // movl %ecx, (%esp) + // retl + // + // __llvm_retpoline_edx: + // ... 
# Same setup + // movl %edx, (%esp) + // retl + // + // This last one is a bit more special and so needs a little extra + // handling. + // __llvm_retpoline_push: + // calll .Lpush_call_target + // .Lpush_capture_spec: + // pause + // lfence + // jmp .Lpush_capture_spec + // .align 16 + // .Lpush_call_target: + // # Clear pause_loop return address. + // addl $4, %esp + // # Top of stack words are: Callee, RA. Exchange Callee and RA. + // pushl 4(%esp) # Push callee + // pushl 4(%esp) # Push RA + // popl 8(%esp) # Pop RA to final RA + // popl (%esp) # Pop callee to next top of stack + // retl # Ret to callee + createThunk(M, "eax", X86::EAX); + createThunk(M, "ecx", X86::ECX); + createThunk(M, "edx", X86::EDX); + createThunk(M, "push"); + } + + return true; +} + +Function *X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) { + LLVMContext &Ctx = M.getContext(); + auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); + Function *F = + Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); + F->setVisibility(GlobalValue::HiddenVisibility); + F->setComdat(M.getOrInsertComdat(Name)); + + // Add Attributes so that we don't create a frame, unwind information, or + // inline. + AttrBuilder B; + B.addAttribute(llvm::Attribute::NoUnwind); + B.addAttribute(llvm::Attribute::Naked); + F->addAttributes(llvm::AttributeList::FunctionIndex, B); + + // Populate our function a bit so that we can verify. + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); + IRBuilder<> Builder(Entry); + + Builder.CreateRetVoid(); + return F; +} + +void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, + unsigned Reg) { + const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; + const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP; + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0) + .addReg(Reg); +} +void X86RetpolineThunks::insert32BitPushReturnAddrClobber( + MachineBasicBlock &MBB) { + // The instruction sequence we use to replace the return address without + // a scratch register is somewhat complicated: + // # Clear capture_spec from return address. + // addl $4, %esp + // # Top of stack words are: Callee, RA. Exchange Callee and RA. + // pushl 4(%esp) # Push callee + // pushl 4(%esp) # Push RA + // popl 8(%esp) # Pop RA to final RA + // popl (%esp) # Pop callee to next top of stack + // retl # Ret to callee + BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP) + .addReg(X86::ESP) + .addImm(4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, + false, 4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, + false, 4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, + false, 8); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, + false, 0); +} + +void X86RetpolineThunks::createThunk(Module &M, StringRef NameSuffix, + Optional Reg) { + Function &F = + *createThunkFunction(M, (Twine("__llvm_retpoline_") + NameSuffix).str()); + MachineFunction &MF = MMI->getOrCreateMachineFunction(F); + + // Set MF properties. We never use vregs... 
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); + + BasicBlock &OrigEntryBB = F.getEntryBlock(); + MachineBasicBlock *Entry = MF.CreateMachineBasicBlock(&OrigEntryBB); + MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(&OrigEntryBB); + MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(&OrigEntryBB); + + MF.push_back(Entry); + MF.push_back(CaptureSpec); + MF.push_back(CallTarget); + + const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL; + + BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget); + Entry->addSuccessor(CallTarget); + Entry->addSuccessor(CaptureSpec); + CallTarget->setHasAddressTaken(); + + // In the capture loop for speculation, we want to stop the processor from + // speculating as fast as possible. On Intel processors, the PAUSE instruction + // will block speculation without consuming any execution resources. On AMD + // processors, the PAUSE instruction is (essentially) a nop, so we also use an + // LFENCE instruction which they have advised will stop speculation as well + // with minimal resource utilization. We still end the capture with a jump to + // form an infinite loop to fully guarantee that no matter what implementation + // of the x86 ISA, speculating this code path never escapes. + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); + CaptureSpec->setHasAddressTaken(); + CaptureSpec->addSuccessor(CaptureSpec); + + CallTarget->setAlignment(4); + if (Reg) { + insertRegReturnAddrClobber(*CallTarget, *Reg); + } else { + assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!"); + insert32BitPushReturnAddrClobber(*CallTarget); + } + BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); +} Index: llvm/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/lib/Target/X86/X86Subtarget.h +++ llvm/lib/Target/X86/X86Subtarget.h @@ -345,6 +345,14 @@ /// Processor supports Cache Line Write Back instruction bool HasCLWB; + /// Use a retpoline thunk rather than indirect calls to block speculative + /// execution. + bool UseRetpoline; + + /// When using a retpoline thunk, call an externally provided thunk rather + /// than emitting one inside the compiler. + bool UseRetpolineExternalThunk; + /// Use software floating point for code generation. bool UseSoftFloat; @@ -579,6 +587,8 @@ bool hasIBT() const { return HasIBT; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } bool hasCLWB() const { return HasCLWB; } + bool useRetpoline() const { return UseRetpoline; } + bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } bool isXRaySupported() const override { return is64Bit(); } @@ -701,6 +711,10 @@ /// Return true if the subtarget allows calls to immediate address. bool isLegalToCallImmediateAddr() const; + /// If we are using retpolines, we need to expand indirectbr to avoid it + /// lowering to an actual indirect jump. + bool enableIndirectBrExpand() const override { return useRetpoline(); } + /// Enable the MachineScheduler pass for all X86 subtargets. 
bool enableMachineScheduler() const override { return true; } Index: llvm/lib/Target/X86/X86Subtarget.cpp =================================================================== --- llvm/lib/Target/X86/X86Subtarget.cpp +++ llvm/lib/Target/X86/X86Subtarget.cpp @@ -315,6 +315,8 @@ HasSGX = false; HasCLFLUSHOPT = false; HasCLWB = false; + UseRetpoline = false; + UseRetpolineExternalThunk = false; IsPMULLDSlow = false; IsSHLDSlow = false; IsUAMem16Slow = false; Index: llvm/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetMachine.cpp +++ llvm/lib/Target/X86/X86TargetMachine.cpp @@ -322,6 +322,7 @@ void addPostRegAlloc() override; void addPreEmitPass() override; void addPreSched2() override; + void addEmitPass() override; }; class X86ExecutionDepsFix : public ExecutionDepsFix { @@ -350,6 +351,11 @@ if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); + + // Add passes that handle indirect branch removal and insertion of a retpoline + // thunk. These will be a no-op unless a function subtarget has the retpoline + // feature enabled. + addPass(createIndirectBrExpandPass()); } bool X86PassConfig::addInstSelector() { @@ -438,3 +444,5 @@ addPass(createX86EvexToVexInsts()); } } + +void X86PassConfig::addEmitPass() { addPass(createX86RetpolineThunksPass()); } Index: llvm/test/CodeGen/X86/O0-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/O0-pipeline.ll +++ llvm/test/CodeGen/X86/O0-pipeline.ll @@ -25,6 +25,7 @@ ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics +; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Rewrite Symbols ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction @@ -58,6 +59,8 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Insert XRay ops ; CHECK-NEXT: Implement the 'patchable-function' attribute +; CHECK-NEXT: X86 Retpoline Thunks +; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: MachineDominator Tree Construction Index: llvm/test/CodeGen/X86/retpoline-external.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/retpoline-external.ll @@ -0,0 +1,166 @@ +; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64 +; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST + +; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86 +; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST + +declare void @bar(i32) + +; Test a simple indirect call and tail call. 
+define void @icall_reg(void (i32)* %fp, i32 %x) #0 { +entry: + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + ret void +} + +; X64-LABEL: icall_reg: +; X64-DAG: movq %rdi, %[[fp:[^ ]*]] +; X64-DAG: movl %esi, %[[x:[^ ]*]] +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: callq __llvm_external_retpoline_r11 +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_reg: +; X64FAST: callq bar +; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq bar +; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_reg: +; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] +; X86-DAG: movl 16(%esp), %[[x:[^ ]*]] +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_external_retpoline_eax +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_external_retpoline_eax +; X86-NOT: # TAILCALL + +; X86FAST-LABEL: icall_reg: +; X86FAST: calll bar +; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll bar +; X86FAST: calll __llvm_external_retpoline_eax + + +@global_fp = external global void (i32)* + +; Test an indirect call through a global variable. +define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { + %fp1 = load void (i32)*, void (i32)** @global_fp + call void %fp1(i32 %x) + %fp2 = load void (i32)*, void (i32)** @global_fp + tail call void %fp2(i32 %x) + ret void +} + +; X64-LABEL: icall_global_fp: +; X64-DAG: movl %edi, %[[x:[^ ]*]] +; X64-DAG: movq global_fp(%rip), %r11 +; X64: callq __llvm_external_retpoline_r11 +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq global_fp(%rip), %r11 +; X64: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_global_fp: +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_global_fp: +; X86: movl global_fp, %eax +; X86: pushl 4(%esp) +; X86: calll __llvm_external_retpoline_eax +; X86: addl $4, %esp +; X86: movl global_fp, %eax +; X86: jmp __llvm_external_retpoline_eax # TAILCALL + +; X86FAST-LABEL: icall_global_fp: +; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL + + +%struct.Foo = type { void (%struct.Foo*)** } + +; Test an indirect call through a vtable. 
+define void @vcall(%struct.Foo* %obj) #0 { + %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0 + %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field + %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1 + %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot + tail call void %fp(%struct.Foo* %obj) + tail call void %fp(%struct.Foo* %obj) + ret void +} + +; X64-LABEL: vcall: +; X64: movq %rdi, %[[obj:[^ ]*]] +; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] +; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] +; X64: movq %[[fp]], %r11 +; X64: callq __llvm_external_retpoline_r11 +; X64-DAG: movq %[[obj]], %rdi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: vcall: +; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X86-LABEL: vcall: +; X86: movl 8(%esp), %[[obj:[^ ]*]] +; X86: movl (%[[obj]]), %[[vptr:[^ ]*]] +; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] +; X86: movl %[[fp]], %eax +; X86: pushl %[[obj]] +; X86: calll __llvm_external_retpoline_eax +; X86: addl $4, %esp +; X86: movl %[[fp]], %eax +; X86: jmp __llvm_external_retpoline_eax # TAILCALL + +; X86FAST-LABEL: vcall: +; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL + + +declare void @direct_callee() + +define void @direct_tail() #0 { + tail call void @direct_callee() + ret void +} + +; X64-LABEL: direct_tail: +; X64: jmp direct_callee # TAILCALL +; X64FAST-LABEL: direct_tail: +; X64FAST: jmp direct_callee # TAILCALL +; X86-LABEL: direct_tail: +; X86: jmp direct_callee # TAILCALL +; X86FAST-LABEL: direct_tail: +; X86FAST: jmp direct_callee # TAILCALL + + +; Lastly check that no thunks were emitted. +; X64-NOT: __{{.*}}_retpoline_{{.*}}: +; X64FAST-NOT: __{{.*}}_retpoline_{{.*}}: +; X86-NOT: __{{.*}}_retpoline_{{.*}}: +; X86FAST-NOT: __{{.*}}_retpoline_{{.*}}: + + +attributes #0 = { "target-features"="+retpoline-external-thunk" } Index: llvm/test/CodeGen/X86/retpoline.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/retpoline.ll @@ -0,0 +1,367 @@ +; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64 +; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST + +; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86 +; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST + +declare void @bar(i32) + +; Test a simple indirect call and tail call. 
+define void @icall_reg(void (i32)* %fp, i32 %x) #0 { +entry: + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + ret void +} + +; X64-LABEL: icall_reg: +; X64-DAG: movq %rdi, %[[fp:[^ ]*]] +; X64-DAG: movl %esi, %[[x:[^ ]*]] +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: callq __llvm_retpoline_r11 +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_reg: +; X64FAST: callq bar +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: callq bar +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_reg: +; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] +; X86-DAG: movl 16(%esp), %[[x:[^ ]*]] +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_retpoline_eax +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_retpoline_eax +; X86-NOT: # TAILCALL + +; X86FAST-LABEL: icall_reg: +; X86FAST: calll bar +; X86FAST: calll __llvm_retpoline_eax +; X86FAST: calll bar +; X86FAST: calll __llvm_retpoline_eax + + +@global_fp = external global void (i32)* + +; Test an indirect call through a global variable. +define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { + %fp1 = load void (i32)*, void (i32)** @global_fp + call void %fp1(i32 %x) + %fp2 = load void (i32)*, void (i32)** @global_fp + tail call void %fp2(i32 %x) + ret void +} + +; X64-LABEL: icall_global_fp: +; X64-DAG: movl %edi, %[[x:[^ ]*]] +; X64-DAG: movq global_fp(%rip), %r11 +; X64: callq __llvm_retpoline_r11 +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq global_fp(%rip), %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_global_fp: +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_global_fp: +; X86: movl global_fp, %eax +; X86: pushl 4(%esp) +; X86: calll __llvm_retpoline_eax +; X86: addl $4, %esp +; X86: movl global_fp, %eax +; X86: jmp __llvm_retpoline_eax # TAILCALL + +; X86FAST-LABEL: icall_global_fp: +; X86FAST: calll __llvm_retpoline_eax +; X86FAST: jmp __llvm_retpoline_eax # TAILCALL + + +%struct.Foo = type { void (%struct.Foo*)** } + +; Test an indirect call through a vtable. 
+define void @vcall(%struct.Foo* %obj) #0 { + %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0 + %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field + %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1 + %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot + tail call void %fp(%struct.Foo* %obj) + tail call void %fp(%struct.Foo* %obj) + ret void +} + +; X64-LABEL: vcall: +; X64: movq %rdi, %[[obj:[^ ]*]] +; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] +; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] +; X64: movq %[[fp]], %r11 +; X64: callq __llvm_retpoline_r11 +; X64-DAG: movq %[[obj]], %rdi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: vcall: +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL + +; X86-LABEL: vcall: +; X86: movl 8(%esp), %[[obj:[^ ]*]] +; X86: movl (%[[obj]]), %[[vptr:[^ ]*]] +; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] +; X86: movl %[[fp]], %eax +; X86: pushl %[[obj]] +; X86: calll __llvm_retpoline_eax +; X86: addl $4, %esp +; X86: movl %[[fp]], %eax +; X86: jmp __llvm_retpoline_eax # TAILCALL + +; X86FAST-LABEL: vcall: +; X86FAST: calll __llvm_retpoline_eax +; X86FAST: jmp __llvm_retpoline_eax # TAILCALL + + +declare void @direct_callee() + +define void @direct_tail() #0 { + tail call void @direct_callee() + ret void +} + +; X64-LABEL: direct_tail: +; X64: jmp direct_callee # TAILCALL +; X64FAST-LABEL: direct_tail: +; X64FAST: jmp direct_callee # TAILCALL +; X86-LABEL: direct_tail: +; X86: jmp direct_callee # TAILCALL +; X86FAST-LABEL: direct_tail: +; X86FAST: jmp direct_callee # TAILCALL + + +declare void @nonlazybind_callee() #1 + +define void @nonlazybind_caller() #0 { + call void @nonlazybind_callee() + tail call void @nonlazybind_callee() + ret void +} + +; X64-LABEL: nonlazybind_caller: +; X64: movq nonlazybind_callee@GOTPCREL(%rip), %[[REG:.*]] +; X64: movq %[[REG]], %r11 +; X64: callq __llvm_retpoline_r11 +; X64: movq %[[REG]], %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL +; X64FAST-LABEL: nonlazybind_caller: +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL +; X86-LABEL: nonlazybind_caller: +; X86: calll nonlazybind_callee@PLT +; X86: jmp nonlazybind_callee@PLT # TAILCALL +; X86FAST-LABEL: nonlazybind_caller: +; X86FAST: calll nonlazybind_callee@PLT +; X86FAST: jmp nonlazybind_callee@PLT # TAILCALL + + +@indirectbr_rewrite.targets = constant [10 x i8*] [i8* blockaddress(@indirectbr_rewrite, %bb0), + i8* blockaddress(@indirectbr_rewrite, %bb1), + i8* blockaddress(@indirectbr_rewrite, %bb2), + i8* blockaddress(@indirectbr_rewrite, %bb3), + i8* blockaddress(@indirectbr_rewrite, %bb4), + i8* blockaddress(@indirectbr_rewrite, %bb5), + i8* blockaddress(@indirectbr_rewrite, %bb6), + i8* blockaddress(@indirectbr_rewrite, %bb7), + i8* blockaddress(@indirectbr_rewrite, %bb8), + i8* blockaddress(@indirectbr_rewrite, %bb9)] + +; Check that when retpolines are enabled a function with indirectbr gets +; rewritten to use switch, and that in turn doesn't get lowered as a jump +; table. 
+define void @indirectbr_rewrite(i64* readonly %p, i64* %sink) #0 { +; X64-LABEL: indirectbr_rewrite: +; X64-NOT: jmpq +; X86-LABEL: indirectbr_rewrite: +; X86-NOT: jmpl +entry: + %i0 = load i64, i64* %p + %target.i0 = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i0 + %target0 = load i8*, i8** %target.i0 + indirectbr i8* %target0, [label %bb1, label %bb3] + +bb0: + store volatile i64 0, i64* %sink + br label %latch + +bb1: + store volatile i64 1, i64* %sink + br label %latch + +bb2: + store volatile i64 2, i64* %sink + br label %latch + +bb3: + store volatile i64 3, i64* %sink + br label %latch + +bb4: + store volatile i64 4, i64* %sink + br label %latch + +bb5: + store volatile i64 5, i64* %sink + br label %latch + +bb6: + store volatile i64 6, i64* %sink + br label %latch + +bb7: + store volatile i64 7, i64* %sink + br label %latch + +bb8: + store volatile i64 8, i64* %sink + br label %latch + +bb9: + store volatile i64 9, i64* %sink + br label %latch + +latch: + %i.next = load i64, i64* %p + %target.i.next = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i.next + %target.next = load i8*, i8** %target.i.next + ; Potentially hit a full 10 successors here so that even if we rewrite as + ; a switch it will try to be lowered with a jump table. + indirectbr i8* %target.next, [label %bb0, + label %bb1, + label %bb2, + label %bb3, + label %bb4, + label %bb5, + label %bb6, + label %bb7, + label %bb8, + label %bb9] +} + +; Lastly check that the necessary thunks were emitted. +; +; X64-LABEL: .section .text.__llvm_retpoline_r11,{{.*}},__llvm_retpoline_r11,comdat +; X64-NEXT: .hidden __llvm_retpoline_r11 +; X64-NEXT: .weak __llvm_retpoline_r11 +; X64: __llvm_retpoline_r11: +; X64-NEXT: # {{.*}} # %entry +; X64-NEXT: callq [[CALL_TARGET:.*]] +; X64-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X64-NEXT: # %entry +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: pause +; X64-NEXT: lfence +; X64-NEXT: jmp [[CAPTURE_SPEC]] +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: [[CALL_TARGET]]: # Block address taken +; X64-NEXT: # %entry +; X64-NEXT: movq %r11, (%rsp) +; X64-NEXT: retq +; +; X86-LABEL: .section .text.__llvm_retpoline_eax,{{.*}},__llvm_retpoline_eax,comdat +; X86-NEXT: .hidden __llvm_retpoline_eax +; X86-NEXT: .weak __llvm_retpoline_eax +; X86: __llvm_retpoline_eax: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: retl +; +; X86-LABEL: .section .text.__llvm_retpoline_ecx,{{.*}},__llvm_retpoline_ecx,comdat +; X86-NEXT: .hidden __llvm_retpoline_ecx +; X86-NEXT: .weak __llvm_retpoline_ecx +; X86: __llvm_retpoline_ecx: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: movl %ecx, (%esp) +; X86-NEXT: retl +; +; X86-LABEL: .section .text.__llvm_retpoline_edx,{{.*}},__llvm_retpoline_edx,comdat +; X86-NEXT: .hidden __llvm_retpoline_edx 
+; X86-NEXT: .weak __llvm_retpoline_edx +; X86: __llvm_retpoline_edx: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: movl %edx, (%esp) +; X86-NEXT: retl +; +; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat +; X86-NEXT: .hidden __llvm_retpoline_push +; X86-NEXT: .weak __llvm_retpoline_push +; X86: __llvm_retpoline_push: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: addl $4, %esp +; X86-NEXT: pushl 4(%esp) +; X86-NEXT: pushl 4(%esp) +; X86-NEXT: popl 8(%esp) +; X86-NEXT: popl (%esp) +; X86-NEXT: retl + + +attributes #0 = { "target-features"="+retpoline" } +attributes #1 = { nonlazybind } Index: llvm/test/Transforms/IndirectBrExpand/basic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IndirectBrExpand/basic.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -indirectbr-expand -S | FileCheck %s +; +; REQUIRES: x86-registered-target + +target triple = "x86_64-unknown-linux-gnu" + +@test1.targets = constant [4 x i8*] [i8* blockaddress(@test1, %bb0), + i8* blockaddress(@test1, %bb1), + i8* blockaddress(@test1, %bb2), + i8* blockaddress(@test1, %bb3)] +; CHECK-LABEL: @test1.targets = constant [4 x i8*] +; CHECK: [i8* inttoptr (i64 1 to i8*), +; CHECK: i8* inttoptr (i64 2 to i8*), +; CHECK: i8* inttoptr (i64 3 to i8*), +; CHECK: i8* blockaddress(@test1, %bb3)] + +define void @test1(i64* readonly %p, i64* %sink) #0 { +; CHECK-LABEL: define void @test1( +entry: + %i0 = load i64, i64* %p + %target.i0 = getelementptr [4 x i8*], [4 x i8*]* @test1.targets, i64 0, i64 %i0 + %target0 = load i8*, i8** %target.i0 + ; Only a subset of blocks are viable successors here. + indirectbr i8* %target0, [label %bb0, label %bb1] +; CHECK-NOT: indirectbr +; CHECK: %[[ENTRY_V:.*]] = ptrtoint i8* %{{.*}} to i64 +; CHECK-NEXT: br label %[[SWITCH_BB:.*]] + +bb0: + store volatile i64 0, i64* %sink + br label %latch + +bb1: + store volatile i64 1, i64* %sink + br label %latch + +bb2: + store volatile i64 2, i64* %sink + br label %latch + +bb3: + store volatile i64 3, i64* %sink + br label %latch + +latch: + %i.next = load i64, i64* %p + %target.i.next = getelementptr [4 x i8*], [4 x i8*]* @test1.targets, i64 0, i64 %i.next + %target.next = load i8*, i8** %target.i.next + ; A different subset of blocks are viable successors here. 
+ indirectbr i8* %target.next, [label %bb1, label %bb2] +; CHECK-NOT: indirectbr +; CHECK: %[[LATCH_V:.*]] = ptrtoint i8* %{{.*}} to i64 +; CHECK-NEXT: br label %[[SWITCH_BB]] +; +; CHECK: [[SWITCH_BB]]: +; CHECK-NEXT: %[[V:.*]] = phi i64 [ %[[ENTRY_V]], %entry ], [ %[[LATCH_V]], %latch ] +; CHECK-NEXT: switch i64 %[[V]], label %bb0 [ +; CHECK-NEXT: i64 2, label %bb1 +; CHECK-NEXT: i64 3, label %bb2 +; CHECK-NEXT: ] +} + +attributes #0 = { "target-features"="+retpoline" } Index: llvm/tools/opt/opt.cpp =================================================================== --- llvm/tools/opt/opt.cpp +++ llvm/tools/opt/opt.cpp @@ -402,6 +402,7 @@ initializeSjLjEHPreparePass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeGlobalMergePass(Registry); + initializeIndirectBrExpandPassPass(Registry); initializeInterleavedAccessPass(Registry); initializeEntryExitInstrumenterPass(Registry); initializePostInlineEntryExitInstrumenterPass(Registry);
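
As context for the call-site checks in retpoline.ll above, here is a minimal sketch of how an indirect call and an indirect tail call are lowered once a function carries the +retpoline target feature. The scratch registers (%r11 on x86-64, %eax/%ecx/%edx on i386) and the __llvm_retpoline_* thunk names are taken from the patch; the source registers %rax and %esi and the surrounding context are hypothetical.

    # x86-64: `callq *%rax` / `jmpq *%rax` under +retpoline
    movq  %rax, %r11              # pin the call target in the thunk's scratch register
    callq __llvm_retpoline_r11    # direct call into the thunk; no indirect branch remains
    movq  %rax, %r11
    jmp   __llvm_retpoline_r11    # indirect tail call becomes a direct tail call to the thunk

    # i386: the target travels in %eax (or %ecx/%edx when %eax is not usable)
    movl  %esi, %eax              # hypothetical source of the call target
    calll __llvm_retpoline_eax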
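
With +retpoline-external-thunk (exercised by retpoline-external.ll above), call sites are lowered the same way but reference externally provided thunks, and the X86 Retpoline Thunks pass emits no thunk bodies into the module. A sketch under the same assumptions as above:

    movq  %rax, %r11
    callq __llvm_external_retpoline_r11   # thunk body must be supplied by the environment
    movl  %esi, %eax
    calll __llvm_external_retpoline_eax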
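
Finally, an annotated sketch of the 64-bit thunk the pass constructs. The instruction sequence matches the CHECK lines in retpoline.ll above; the local label names and the annotations are illustrative only.

__llvm_retpoline_r11:
    callq .Lcall_target        # pushes the address of .Lcapture_spec and branches over the loop
.Lcapture_spec:                # never executed architecturally; reached only when the
                               # processor speculates the final retq via its return predictor
    pause                      # stall the speculative path (Intel)
    lfence                     # stop speculation on AMD as well
    jmp .Lcapture_spec         # keep any speculation trapped in this loop
    .p2align 4
.Lcall_target:
    movq %r11, (%rsp)          # overwrite the pushed return address with the real call target
    retq                       # architecturally jumps to *%r11; speculation is captured above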