Index: clang/lib/Driver/Tools.cpp =================================================================== --- clang/lib/Driver/Tools.cpp +++ clang/lib/Driver/Tools.cpp @@ -5059,6 +5059,7 @@ case llvm::Triple::x86_64: case llvm::Triple::arm: case llvm::Triple::aarch64: + case llvm::Triple::ppc64le: // Supported. break; default: Index: compiler-rt/cmake/config-ix.cmake =================================================================== --- compiler-rt/cmake/config-ix.cmake +++ compiler-rt/cmake/config-ix.cmake @@ -175,7 +175,7 @@ set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64}) set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64}) set(ALL_SCUDO_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}) -set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64}) +set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${PPC64}) if(APPLE) include(CompilerRTDarwinUtils) Index: compiler-rt/lib/xray/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/CMakeLists.txt +++ compiler-rt/lib/xray/CMakeLists.txt @@ -29,6 +29,12 @@ xray_trampoline_AArch64.S ${XRAY_SOURCES}) +set(powerpc64le_SOURCES + xray_powerpc64.cc + xray_trampoline_powerpc64.cc + xray_trampoline_powerpc64.S + ${XRAY_SOURCES}) + include_directories(..) 
include_directories(../../include) Index: compiler-rt/lib/xray/xray_fdr_logging.cc =================================================================== --- compiler-rt/lib/xray/xray_fdr_logging.cc +++ compiler-rt/lib/xray/xray_fdr_logging.cc @@ -36,7 +36,7 @@ #if defined(__x86_64__) #include "xray_x86_64.h" -#elif defined(__arm__) || defined(__aarch64__) +#elif defined(__arm__) || defined(__aarch64__) || defined(__powerpc64__) #include "xray_emulate_tsc.h" #else #error "Unsupported CPU Architecture" Index: compiler-rt/lib/xray/xray_inmemory_log.cc =================================================================== --- compiler-rt/lib/xray/xray_inmemory_log.cc +++ compiler-rt/lib/xray/xray_inmemory_log.cc @@ -26,7 +26,7 @@ #if defined(__x86_64__) #include "xray_x86_64.h" -#elif defined(__arm__) || defined(__aarch64__) +#elif defined(__arm__) || defined(__aarch64__) || defined(__powerpc64__) #include "xray_emulate_tsc.h" #else #error "Unsupported CPU Architecture" Index: compiler-rt/lib/xray/xray_interface.cc =================================================================== --- compiler-rt/lib/xray/xray_interface.cc +++ compiler-rt/lib/xray/xray_interface.cc @@ -35,6 +35,8 @@ static const int16_t cSledLength = 32; #elif defined(__arm__) static const int16_t cSledLength = 28; +#elif defined(__powerpc64__) +static const int16_t cSledLength = 8; #else #error "Unsupported CPU Architecture" #endif /* CPU architecture */ Index: compiler-rt/lib/xray/xray_powerpc64.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_powerpc64.cc @@ -0,0 +1,100 @@ +//===-- xray_powerpc64.cc ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of powerpc64 and powerpc64le routines. +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_emulate_tsc.h" +#include "xray_interface_internal.h" +#include "xray_utils.h" +#include +#include +#include + +#ifndef __LITTLE_ENDIAN__ +#error powerpc64 big endian is not supported for now. +#endif + +namespace { + +constexpr unsigned long long JumpOverInstNum = 7; + +void clearCache(void *Addr, size_t Len) { + const size_t LineSize = 32; + + const intptr_t Mask = ~(LineSize - 1); + const intptr_t StartLine = ((intptr_t)Addr) & Mask; + const intptr_t EndLine = ((intptr_t)Addr + Len + LineSize - 1) & Mask; + + for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize) + asm volatile("dcbf 0, %0" : : "r"(Line)); + asm volatile("sync"); + + for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize) + asm volatile("icbi 0, %0" : : "r"(Line)); + asm volatile("isync"); +} + +} // namespace + +extern "C" void __clear_cache(void *start, void *end); + +namespace __xray { + +uint64_t cycleFrequency() XRAY_NEVER_INSTRUMENT { + return NanosecondsPerSecond; +} + +bool patchFunctionEntry(const bool Enable, uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + if (Enable) { + // lis 0, FuncId[16..31] + // ori 0, 0, FuncId[0..15] + *reinterpret_cast(Sled.Address) = + (0x3c000000ull + (FuncId >> 16)) + + ((0x60000000ull + (FuncId & 0xffff)) << 32); + } else { + // b +JumpOverInstNum instructions.
+ *reinterpret_cast(Sled.Address) = + 0x48000000ull + (JumpOverInstNum << 2); + } + clearCache(reinterpret_cast(Sled.Address), 8); + return true; +} + +bool patchFunctionExit(const bool Enable, uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + if (Enable) { + // lis 0, FuncId[16..31] + // ori 0, 0, FuncId[0..15] + *reinterpret_cast(Sled.Address) = + (0x3c000000ull + (FuncId >> 16)) + + ((0x60000000ull + (FuncId & 0xffff)) << 32); + } else { + // Copy the blr/b instruction after JumpOverInstNum instructions. + *reinterpret_cast(Sled.Address) = + *(reinterpret_cast(Sled.Address) + JumpOverInstNum); + } + clearCache(reinterpret_cast(Sled.Address), 8); + return true; +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + return patchFunctionExit(Enable, FuncId, Sled); +} + +// FIXME: Maybe implement this better? +bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +} // namespace __xray Index: compiler-rt/lib/xray/xray_trampoline_powerpc64.S =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_trampoline_powerpc64.S @@ -0,0 +1,171 @@ + .text + .abiversion 2 + .globl __xray_FunctionEntry + .p2align 4 +__xray_FunctionEntry: + std 0, 16(1) + stdu 1, -408(1) +# Spill r3-r10, f1-f13, and vsr34-vsr45, which are parameter registers. +# If this appears to be slow, the caller needs to pass in number of generic, +# floating point, and vector parameters, so that we only spill those live ones.
+ std 3, 32(1) + ld 3, 400(1) # FuncId + std 4, 40(1) + std 5, 48(1) + std 6, 56(1) + std 7, 64(1) + std 8, 72(1) + std 9, 80(1) + std 10, 88(1) + addi 4, 1, 96 + stxsdx 1, 0, 4 + addi 4, 1, 104 + stxsdx 2, 0, 4 + addi 4, 1, 112 + stxsdx 3, 0, 4 + addi 4, 1, 120 + stxsdx 4, 0, 4 + addi 4, 1, 128 + stxsdx 5, 0, 4 + addi 4, 1, 136 + stxsdx 6, 0, 4 + addi 4, 1, 144 + stxsdx 7, 0, 4 + addi 4, 1, 152 + stxsdx 8, 0, 4 + addi 4, 1, 160 + stxsdx 9, 0, 4 + addi 4, 1, 168 + stxsdx 10, 0, 4 + addi 4, 1, 176 + stxsdx 11, 0, 4 + addi 4, 1, 184 + stxsdx 12, 0, 4 + addi 4, 1, 192 + stxsdx 13, 0, 4 + addi 4, 1, 200 + stxvd2x 34, 0, 4 + addi 4, 1, 216 + stxvd2x 35, 0, 4 + addi 4, 1, 232 + stxvd2x 36, 0, 4 + addi 4, 1, 248 + stxvd2x 37, 0, 4 + addi 4, 1, 264 + stxvd2x 38, 0, 4 + addi 4, 1, 280 + stxvd2x 39, 0, 4 + addi 4, 1, 296 + stxvd2x 40, 0, 4 + addi 4, 1, 312 + stxvd2x 41, 0, 4 + addi 4, 1, 328 + stxvd2x 42, 0, 4 + addi 4, 1, 344 + stxvd2x 43, 0, 4 + addi 4, 1, 360 + stxvd2x 44, 0, 4 + addi 4, 1, 376 + stxvd2x 45, 0, 4 + std 2, 392(1) + mflr 0 + std 0, 400(1) + + li 4, 0 + bl _ZN6__xray23CallXRayPatchedFunctionEi13XRayEntryType + nop + + addi 4, 1, 96 + lxsdx 1, 0, 4 + addi 4, 1, 104 + lxsdx 2, 0, 4 + addi 4, 1, 112 + lxsdx 3, 0, 4 + addi 4, 1, 120 + lxsdx 4, 0, 4 + addi 4, 1, 128 + lxsdx 5, 0, 4 + addi 4, 1, 136 + lxsdx 6, 0, 4 + addi 4, 1, 144 + lxsdx 7, 0, 4 + addi 4, 1, 152 + lxsdx 8, 0, 4 + addi 4, 1, 160 + lxsdx 9, 0, 4 + addi 4, 1, 168 + lxsdx 10, 0, 4 + addi 4, 1, 176 + lxsdx 11, 0, 4 + addi 4, 1, 184 + lxsdx 12, 0, 4 + addi 4, 1, 192 + lxsdx 13, 0, 4 + addi 4, 1, 200 + lxvd2x 34, 0, 4 + addi 4, 1, 216 + lxvd2x 35, 0, 4 + addi 4, 1, 232 + lxvd2x 36, 0, 4 + addi 4, 1, 248 + lxvd2x 37, 0, 4 + addi 4, 1, 264 + lxvd2x 38, 0, 4 + addi 4, 1, 280 + lxvd2x 39, 0, 4 + addi 4, 1, 296 + lxvd2x 40, 0, 4 + addi 4, 1, 312 + lxvd2x 41, 0, 4 + addi 4, 1, 328 + lxvd2x 42, 0, 4 + addi 4, 1, 344 + lxvd2x 43, 0, 4 + addi 4, 1, 360 + lxvd2x 44, 0, 4 + addi 4, 1, 376 + lxvd2x 45, 0, 4 + ld
0, 400(1) + mtlr 0 + ld 2, 392(1) + ld 3, 32(1) + ld 4, 40(1) + ld 5, 48(1) + ld 6, 56(1) + ld 7, 64(1) + ld 8, 72(1) + ld 9, 80(1) + ld 10, 88(1) + + addi 1, 1, 408 + ld 0, 16(1) + blr + + .globl __xray_FunctionExit + .p2align 4 +__xray_FunctionExit: + std 0, 16(1) + ld 0, -8(1) # FuncId + stdu 1, -72(1) +# Spill r3, f1, and vsr34, the return value registers. + std 3, 32(1) + mr 3, 0 + addi 4, 1, 40 + stxsdx 1, 0, 4 + addi 4, 1, 48 + stxvd2x 34, 0, 4 + mflr 0 + std 0, 64(1) + li 4, 1 + bl _ZN6__xray23CallXRayPatchedFunctionEi13XRayEntryType + nop + ld 0, 64(1) + mtlr 0 + ld 3, 32(1) + addi 4, 1, 40 + lxsdx 1, 0, 4 + addi 4, 1, 48 + lxvd2x 34, 0, 4 + addi 1, 1, 72 + ld 0, 16(1) + blr Index: compiler-rt/lib/xray/xray_trampoline_powerpc64.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_trampoline_powerpc64.cc @@ -0,0 +1,15 @@ +#include +#include + +namespace __xray { + +extern std::atomic XRayPatchedFunction; + +// Implement this in C++ instead of assembly, to avoid dealing with TOC by hand. +void CallXRayPatchedFunction(int32_t FuncId, XRayEntryType Type) { + auto fptr = __xray::XRayPatchedFunction.load(); + if (fptr != nullptr) + (*fptr)(FuncId, Type); +} + +} // namespace __xray Index: compiler-rt/lib/xray/xray_utils.cc =================================================================== --- compiler-rt/lib/xray/xray_utils.cc +++ compiler-rt/lib/xray/xray_utils.cc @@ -26,7 +26,7 @@ #if defined(__x86_64__) #include "xray_x86_64.h" -#elif defined(__arm__) || defined(__aarch64__) +#elif defined(__arm__) || defined(__aarch64__) || defined(__powerpc64__) #include "xray_emulate_tsc.h" #else #error "Unsupported CPU Architecture" @@ -113,7 +113,7 @@ } else { Report("Unable to determine CPU frequency for TSC accounting.\n"); } -#elif defined(__arm__) || defined(__aarch64__) +#elif defined(__arm__) || defined(__aarch64__) || defined(__powerpc64__) // There is no instruction like RDTSCP in user mode on ARM.
ARM's CP15 does // not have a constant frequency like TSC on x86(_64), it may go faster // or slower depending on CPU turbo or power saving mode. Furthermore, Index: llvm/lib/CodeGen/XRayInstrumentation.cpp =================================================================== --- llvm/lib/CodeGen/XRayInstrumentation.cpp +++ llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -157,6 +157,7 @@ case Triple::ArchType::arm: case Triple::ArchType::thumb: case Triple::ArchType::aarch64: + case Triple::ArchType::ppc64le: // For the architectures which don't have a single return instruction prependRetWithPatchableExit(MF, TII); break; Index: llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -112,7 +112,9 @@ void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget(); - return AsmPrinter::runOnMachineFunction(MF); + bool Changed = AsmPrinter::runOnMachineFunction(MF); + emitXRayTable(); + return Changed; } }; @@ -134,6 +136,7 @@ void EmitFunctionBodyStart() override; void EmitFunctionBodyEnd() override; + void EmitInstruction(const MachineInstr *MI) override; }; /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac @@ -1046,6 +1049,94 @@ EmitToStreamer(*OutStreamer, TmpInst); } +void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { + if (!Subtarget->isPPC64()) + return PPCAsmPrinter::EmitInstruction(MI); + + switch (MI->getOpcode()) { + default: + return PPCAsmPrinter::EmitInstruction(MI); + case TargetOpcode::PATCHABLE_FUNCTION_ENTER: { + // .begin: + // b .end # lis 0, FuncId[16..31] + // nop # ori 0, 0, FuncId[0..15] + // std 0, -8(1) + // mflr 0 + // bl __xray_FunctionEntry + // mtlr 0 + // .end: + // + // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number + // of
instructions change. + MCSymbol *BeginOfSled = OutContext.createTempSymbol(); + MCSymbol *EndOfSled = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(BeginOfSled); + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::B).addExpr( + MCSymbolRefExpr::create(EndOfSled, OutContext))); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0)); + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL8_NOP) + .addExpr(MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol("__xray_FunctionEntry"), + OutContext))); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0)); + OutStreamer->EmitLabel(EndOfSled); + recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER); + break; + } + case TargetOpcode::PATCHABLE_FUNCTION_EXIT: { + // .p2align 3 + // .begin: + // blr # lis 0, FuncId[16..31] + // nop # ori 0, 0, FuncId[0..15] + // std 0, -8(1) + // mflr 0 + // bl __xray_FunctionExit + // mtlr 0 + // .end: + // blr + // + // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number + // of instructions change.
+ const MachineInstr *Next = [&] { + MachineBasicBlock::const_iterator It(MI); + const MachineBasicBlock *MBB = MI->getParent(); + assert(It != MBB->end()); + ++It; + assert(It->isReturn()); + return &*It; + }(); + OutStreamer->EmitCodeAlignment(8); + MCSymbol *BeginOfSled = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(BeginOfSled); + MCInst TmpInst; + LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false); + EmitToStreamer(*OutStreamer, TmpInst); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0)); + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL8_NOP) + .addExpr(MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol("__xray_FunctionExit"), + OutContext))); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0)); + recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT); + break; + } + case TargetOpcode::PATCHABLE_TAIL_CALL: + case TargetOpcode::PATCHABLE_RET: + llvm_unreachable(""); + } +} + void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { if (static_cast(TM).isELFv2ABI()) { PPCTargetStreamer *TS = Index: llvm/lib/Target/PowerPC/PPCInstrInfo.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -65,7 +65,8 @@ void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) - : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), + : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, -1, + STI.isPPC64() ?
PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for Index: llvm/lib/Target/PowerPC/PPCSubtarget.h =================================================================== --- llvm/lib/Target/PowerPC/PPCSubtarget.h +++ llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -318,6 +318,8 @@ /// classifyGlobalReference - Classify a global variable reference for the /// current subtarget accourding to how we should reference it. unsigned char classifyGlobalReference(const GlobalValue *GV) const; + + bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; } }; } // End llvm namespace Index: llvm/lib/XRay/InstrumentationMap.cpp =================================================================== --- llvm/lib/XRay/InstrumentationMap.cpp +++ llvm/lib/XRay/InstrumentationMap.cpp @@ -55,7 +55,8 @@ // Find the section named "xray_instr_map". if (!ObjFile.getBinary()->isELF() || - ObjFile.getBinary()->getArch() != Triple::x86_64) + !(ObjFile.getBinary()->getArch() == Triple::x86_64 || + ObjFile.getBinary()->getArch() == Triple::ppc64le)) return make_error( "File format not supported (only does ELF little endian 64-bit).", std::make_error_code(std::errc::not_supported));