Index: tools/llvm-canon/CMakeLists.txt =================================================================== --- /dev/null +++ tools/llvm-canon/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Core + IRReader + Support + ) + +add_llvm_tool(llvm-canon + llvm-canon.cpp + canonicalizer.cpp + ) \ No newline at end of file Index: tools/llvm-canon/LLVMBuild.txt =================================================================== --- /dev/null +++ tools/llvm-canon/LLVMBuild.txt @@ -0,0 +1,20 @@ +;===- ./tools/llvm-canon/LLVMBuild.txt --------------------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-canon +parent = Tools \ No newline at end of file Index: tools/llvm-canon/canonicalizer.h =================================================================== --- /dev/null +++ tools/llvm-canon/canonicalizer.h @@ -0,0 +1,101 @@ +//===-- canonicalizer.h - Canonicalizer class -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This header declares the Canonicalizer class which aims to transform LLVM +/// Modules into a canonical form by reordering and renaming instructions while +/// preserving the same semantics. This tool makes it easier to spot semantic +/// differences while diffing two modules which have undergone different passes. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_CANON_CANONICALIZER_H +#define LLVM_TOOLS_LLVM_CANON_CANONICALIZER_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include + +using namespace llvm; + +namespace llvm { + +/// Canonicalizer aims to transform LLVM IR into canonical form. +class Canonicalizer : public ModulePass { +public: + static char ID; + + /// Default constructor for the Canonicalizer. Sets default canonicalization + /// flags: + /// renameAll: false - Renames only unnamed instructions. + /// foldPreoutputs: false - Does not fold pre-output instructions. + /// + /// \see renameAll + /// \see foldPreoutputs + Canonicalizer() + : ModulePass(ID), + renameAll(false), + foldPreoutputs(false) {} + + /// Constructor for the Canonicalizer. + /// + /// \see renameAll + /// \see foldPreoutputs + /// \param rename When true, renames all instructions including named ones. + /// \param fold When true, doesn't fold pre-output instructions. + Canonicalizer(bool rename, bool fold) + : ModulePass(ID), + renameAll(rename), + foldPreoutputs(fold) {} + + bool runOnModule(Module &module) override; + +private: + /// \name Canonicalizer flags. + /// @{ + + /// Renames all instructions including named ones. + bool renameAll; + /// Folds all regular instructions (including pre-outputs). + bool foldPreoutputs; + /// @} + + /// \name Naming. + /// @{ + void nameFunctionArguments(Function &F); + void nameBasicBlocks(Function &F); + void nameInstructions(SmallVector &Outputs); + void nameInstruction(Instruction *I); + void nameAsInitialInstruction(Instruction *I); + void nameAsRegularInstruction(Instruction *I); + void foldInstructionName(Instruction *I); + /// @} + + /// \name Reordering instructions. + /// @{ + void reorderInstructions(SmallVector &Outputs); + void reorderInstruction(Instruction *Used, Instruction *User, + SmallPtrSet &Visited); + void reorderInstructionOperandsByNames(Instruction *I); + /// @} + + /// \name Utility methods. + /// @{ + SmallVector collectOutputInstructions(Function &F); + bool isOutput(const Instruction *I); + bool isInitialInstruction(const Instruction *I); + bool hasOnlyImmediateOperands(const Instruction *I); + SetVector getOutputFootprint(Instruction *I, + SmallPtrSet &Visited); + /// @} +}; +} + +#endif Index: tools/llvm-canon/canonicalizer.cpp =================================================================== --- /dev/null +++ tools/llvm-canon/canonicalizer.cpp @@ -0,0 +1,600 @@ +//===-- canonicalizer.cpp - Canonicalizer class -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the Canonicalizer class which aims to transform LLVM +/// Modules into a canonical form by reordering and renaming instructions while +/// preserving the same semantics. This tool makes it easier to spot semantic +/// differences while diffing two modules which have undergone different passes. +/// +//===----------------------------------------------------------------------===// + +#include "canonicalizer.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "canonicalizer" + +namespace llvm { + +char Canonicalizer::ID = 0; + +/// Entry method to the Canonicalizer. +/// +/// \param M Module to canonicalize. +bool Canonicalizer::runOnModule(Module &M) { + for (auto &F : M) { + + nameFunctionArguments(F); + + nameBasicBlocks(F); + + SmallVector Outputs = collectOutputInstructions(F); + + reorderInstructions(Outputs); + + nameInstructions(Outputs); + + for (auto &B : F) { + for (auto &I : B) { + if (I.isCommutative()) { + reorderInstructionOperandsByNames(&I); + } + + foldInstructionName(&I); + } + } + } + + return true; +} + +/// Numbers arguments. +/// +/// \param F Function whose arguments will be renamed. +void Canonicalizer::nameFunctionArguments(Function &F) { + int ArgumentCounter = 0; + + for (auto &A : F.args()) { + if (renameAll || A.getName().empty()) { + // Argument has no name or renameAll flag is used. + A.setName("a" + Twine(ArgumentCounter)); + ++ArgumentCounter; + } + } +} + +/// Names basic blocks using a generated hash for each basic block in +/// a function considering the opcode and the order of output instructions. +/// +/// \param F Function containing basic blocks to rename. +void Canonicalizer::nameBasicBlocks(Function &F) { + for (auto &B : F) { + // Initialize to a random constant, so the state isn't zero. + uint64_t Hash = 0x6acaa36bef8325c5ULL; + + // Hash considering output instruction opcodes. + for (auto &I : B) { + if (isOutput(&I)) { + Hash = hashing::detail::hash_16_bytes(Hash, I.getOpcode()); + } + } + + if (renameAll || B.getName().empty()) { + // Name basic block. Substring hash to make diffs more readable. + B.setName("bb" + std::to_string(Hash).substr(0, 5)); + } + } +} + +/// Wrapper method for naming instructions. Iterates over a vector of outputs, +/// side-effecting instructins or ReturnInst, and renames each of them +/// using nameInstruction(). +/// +/// \see nameInstruction() +/// \param Outputs A vector of output instructions (side-effecting instructions or +/// ReturnInst). +void Canonicalizer::nameInstructions(SmallVector &Outputs) { + + for (auto &I : Outputs) { + nameInstruction(I); + } +} + +/// Names instructions graphically (recursive) in accordance with the +/// def-use tree, starting from the initial instructions (defs), finishing at +/// the output (top-most user) instructions (depth-first). +/// +/// \param I Instruction to be renamed. +void Canonicalizer::nameInstruction(Instruction *I) { + // Determine the type of instruction to name. + + if (isInitialInstruction(I)) { + // This is an initial instruction. + nameAsInitialInstruction(I); + } + else { + // This must be a regular instruction. + nameAsRegularInstruction(I); + } + +} + +/// Names instruction following the scheme: +/// vl00000Calle(Operands) +/// +/// Where 00000 is a hash calculated considering instruction's opcode and output +/// footprint. Calle's name is only included when instruction's type is CallInst. +/// In cases where instruction is commutative, operands list is also sorted. +/// +/// Renames instruction only when RenameAll flag is raised or instruction is unnamed. +/// +/// \see getOutputFootprint() +/// \param I Instruction to be renamed. +void Canonicalizer::nameAsInitialInstruction(Instruction *I) { + // Instruction operands for further sorting. + SmallVector Operands; + + // Collect operands. + for (auto &OP : I->operands()) { + if (isa(OP) && !isa(OP)) { + std::string TextRepresentation; + raw_string_ostream Stream(TextRepresentation); + OP->printAsOperand(Stream, false); + Operands.push_back(Stream.str()); + } + } + + if (I->isCommutative()) { + // This instruction is commutative, sort operands. + std::sort(Operands.begin(), Operands.end()); + } + + std::string OperandList = std::accumulate + (Operands.begin(), Operands.end(), std::string(), + [](const std::string& a, const std::string& b) -> std::string { + return a + (a.length() > 0 ? ", " : "") + b; + }); + + + // Initialize to a random constant, so the state isn't zero. + uint64_t Hash = 0x6acaa36bef8325c5ULL; + + // Consider instruction's opcode in the hash. + Hash = hashing::detail::hash_16_bytes(Hash, I->getOpcode()); + + SmallPtrSet Visited; + // Get output footprint for I. + SetVector outputFootprint = getOutputFootprint(I, Visited); + + // Consider output footprint in the hash. + for (auto output : outputFootprint) { + Hash = hashing::detail::hash_16_bytes(Hash, output); + } + + // Base instruction name. + std::string Name = "vl" + std::to_string(Hash).substr(0, 5); + + // In case of CallInst, consider callee in the instruction name. + if (auto CI = dyn_cast(I)) { + Function *F = CI->getCalledFunction(); + + if (F != nullptr) { + Name.append(F->getName()); + } + } + + Name.append("(" + OperandList + ")"); + + if ((I->getName().empty() || renameAll) && !I->getType()->isVoidTy()) { + // Instruction is unnamed or renameAll flag is raised. + I->setName(Name); + } +} + +/// Names instruction following the scheme: +/// op00000Calle(Operands) +/// +/// Where 00000 is a hash calculated considering instruction's opcode, its +/// operands' opcodes and order. Calle's name is only included when instruction's +/// type is CallInst. In cases where instruction is commutative, operand list is +/// also sorted. +/// +/// Names instructions recursively in accordance with the def-use tree, +/// starting from the initial instructions (defs), finishing at +/// the output (top-most user) instructions (depth-first). +/// +/// Renames instruction only when RenameAll flag is raised or instruction is unnamed. +/// +/// \see getOutputFootprint() +/// \param I Instruction to be renamed. +void Canonicalizer::nameAsRegularInstruction(Instruction *I) { + // Instruction operands for further sorting. + SmallVector Operands; + + // The name of a regular instruction depends + // on the names of its operands. Hence, all + // operands must be named first in the use-def + // walk. + + // Collect operands. + for (auto &OP : I->operands()) { + if (auto IOP = dyn_cast(OP)) { + // This is an an instruction. + + // Walk down the use-def chain. + nameInstruction(IOP); + Operands.push_back(IOP->getName()); + } + else if (isa(OP) && !isa(OP)) { + // This must be an immediate value. + + std::string TextRepresentation; + raw_string_ostream Stream(TextRepresentation); + OP->printAsOperand(Stream, false); + Operands.push_back(Stream.str()); + } + + } + + if (I->isCommutative()) { + // This instruction is commutative, sort operands. + std::sort(Operands.begin(), Operands.end()); + } + + std::string OperandList = std::accumulate + (Operands.begin(), Operands.end(), std::string(), + [](const std::string& a, const std::string& b) -> std::string { + return a + (a.length() > 0 ? ", " : "") + b; + }); + + + // Initialize to a random constant, so the state isn't zero. + uint64_t Hash = 0x6acaa36bef8325c5ULL; + + // Consider instruction opcode in the hash. + Hash = hashing::detail::hash_16_bytes(Hash, I->getOpcode()); + + + // Operand opcodes for further sorting (commutative). + SmallVector OperandsOpcodes; + + // Collect operand opcodes for hashing. + for (auto &OP : I->operands()) { + if (auto IOP = dyn_cast(OP)) { + OperandsOpcodes.push_back(IOP->getOpcode()); + } + } + + if (I->isCommutative()) { + // This instruction is commutative, sort operand opcodes for hashing. + std::sort(OperandsOpcodes.begin(), OperandsOpcodes.end()); + } + + // Consider operand opcodes in the hash. + for (auto Code : OperandsOpcodes) { + Hash = hashing::detail::hash_16_bytes(Hash, Code); + } + + + // Base instruction name. + std::string Name = "op" + std::to_string(Hash).substr(0, 5); + + // In case of CallInst, consider callee in the instruction name. + if (auto CI = dyn_cast(I)) { + Function *F = CI->getCalledFunction(); + + if (F != nullptr) { + Name.append(F->getName()); + } + } + + Name.append("(" + OperandList + ")"); + + if ((I->getName().empty() || renameAll) && !I->getType()->isVoidTy()) { + // Instruction is unnamed or renameAll flag is raised. + I->setName(Name); + } +} + +/// Shortens instruction's name. This method removes called function name from +/// the instruction name and substitutes the call chain with a corresponding +/// list of operands. +/// +/// Examples: +/// op00000Calle(op00001Calle(...), vl00000Calle(1, 2), ...) -> op00000(op00001, vl00000, ...) +/// vl00000Calle(1, 2) -> vl00000(1, 2) +/// +/// This method omits output instructions and pre-output (instructions directly used +/// by an output instruction) instructions (by default). By default it also does not +/// affect user named instructions. +/// +/// \param I Instruction whose name will be folded. +void Canonicalizer::foldInstructionName(Instruction *I) { + + // If this flas is raised, fold all regular + // instructions (including pre-outputs). + if (!foldPreoutputs) { + + // Don't fold if one of the users is an output instruction. + for (auto U : I->users()) { + if (auto IU = dyn_cast(U)) { + if (isOutput(IU)) { + return; + } + } + } + } + + // Don't fold if it is an output instruction or has no op prefix. + if (isOutput(I) || I->getName().substr(0, 2) != "op") { + return; + } + + // Instruction operands. + SmallVector Operands; + + for (auto &OP : I->operands()) { + if (auto IOP = dyn_cast(OP)) { + if (I->getName().substr(0, 2) == "op" || I->getName().substr(0, 2) == "vl") { + // Regular/initial instruction with canonicalized name. + Operands.push_back(IOP->getName().substr(0, 7)); + } + else { + // User-named instruction, don't substring. + Operands.push_back(IOP->getName()); + } + } + } + + if (I->isCommutative()) { + // This instruction is commutative, sort operands. + std::sort(Operands.begin(), Operands.end()); + } + + std::string OperandList = std::accumulate + (Operands.begin(), Operands.end(), std::string(), + [](const std::string& a, const std::string& b) -> std::string { + return a + (a.length() > 0 ? ", " : "") + b; + }); + + std::string Name = I->getName().substr(0, 7); + Name.append("(" + OperandList + ")"); + I->setName(Name); +} + +/// Reorders instructions by walking up the tree from each operand of an output +/// instruction and reducing the def-use distance. +/// This method assumes that output instructions were collected top-down, +/// otherwise the def-use chain may be broken. +/// This method is a wrapper for recursive reorderInstruction(). +/// +/// \see reorderInstruction() +/// \param Outputs Vector of pointers to output instructions collected top-down. +void Canonicalizer::reorderInstructions(SmallVector &Outputs) { + // This method assumes output instructions were collected top-down, + // otherwise the def-use chain may be broken. + + SmallPtrSet Visited; + + // Walk up the tree. + for (auto &I : Outputs) { + for (auto &OP : I->operands()) { + if (auto IOP = dyn_cast(OP)) { + reorderInstruction(IOP, I, Visited); + } + } + } +} + +/// Reduces def-use distance or places instruction at the end of the basic block. +/// Continues to walk up the def-use tree recursively. +/// Used by reorderInstructions(). +/// +/// \see reorderInstructions() +/// \param Used Pointer to the instruction whose value is used by the \p User. +/// \param User Pointer to the instruction which uses the \p Used. +/// \param Visited Set of visited instructions. +void Canonicalizer::reorderInstruction(Instruction *Used, Instruction *User, + SmallPtrSet &Visited) { + + if (!Visited.count(Used)) { + + Visited.insert(Used); + + if (Used->getParent() == User->getParent()) { + // If Used and User share the same basic block move Used just before User. + Used->moveBefore(User); + } + else { + // Otherwise move Used to the very end of its basic block. + Used->moveBefore(&Used->getParent()->back()); + } + + for (auto &OP : Used->operands()) { + if (auto IOP = dyn_cast(OP)) { + // Walk up the def-use tree. + reorderInstruction(IOP, Used, Visited); + } + } + } +} + +/// Reorders instruction's operands alphabetically. This method assumes +/// that passed instruction is commutative. Changing the operand order +/// in other instructions may change the semantics. +/// +/// \param I Instruction whose operands will be reordered. +void Canonicalizer::reorderInstructionOperandsByNames(Instruction *I) { + // This method assumes that passed I is commutative, + // changing the order of operands in other instructions + // may change the semantics. + + // Instruction operands for further sorting. + SmallVector, 4> Operands; + + //Collect operands. + for (auto &OP : I->operands()) { + if (auto VOP = dyn_cast(OP)) { + if (isa(VOP)) { + // This is an an instruction. + Operands.push_back(std::pair(VOP->getName(), VOP)); + } + else { + std::string TextRepresentation; + raw_string_ostream Stream(TextRepresentation); + OP->printAsOperand(Stream, false); + Operands.push_back(std::pair(Stream.str(), VOP)); + } + } + } + + // Sort operands. + std::sort(Operands.begin(), Operands.end(), [] + (const std::pair &lhs, const std::pair &rhs) + { + return lhs.first < rhs.first; + }); + + // Reorder operands. + unsigned position = 0; + for (auto &OP : I->operands()) { + OP.set(Operands[position].second); + position++; + } +} + +/// Returns a vector of output instructions. An output is an instruction which +/// has side-effects or is ReturnInst. Uses isOutput(). +/// +/// \see isOutput() +/// \param F Function to collect outputs from. +SmallVector Canonicalizer::collectOutputInstructions(Function &F) { + // Output instructions are collected top-down in each function, + // any change may break the def-use chain in reordering methods. + SmallVector Outputs; + + for (auto &B : F) { + for (auto &I : B) { + if (isOutput(&I)) { + Outputs.push_back(&I); + } + } + } + + return Outputs; +} + +/// Helper method checking whether the instruction may have side effects or is +/// ReturnInst. +/// +/// \param I Considered instruction. +bool Canonicalizer::isOutput(const Instruction *I) { + // Outputs are such instructions which may have side effects or is ReturnInst. + if (I->mayHaveSideEffects() || isa(I)) { + return true; + } + + return false; +} + +/// Helper method checking whether the instruction has users and only +/// immediate operands. +/// +/// \param I Considered instruction. +bool Canonicalizer::isInitialInstruction(const Instruction *I) { + // Initial instructions are such instructions whose values are used by + // other instructions, yet they only depend on immediate values. + if (std::distance(I->user_begin(), I->user_end()) != 0 + && hasOnlyImmediateOperands(I)) { + return true; + } + + return false; +} + +/// Helper method checking whether the instruction has only immediate operands. +/// +/// \param I Considered instruction. +bool Canonicalizer::hasOnlyImmediateOperands(const Instruction *I) { + for (auto &OP : I->operands()) { + if (isa(OP)) { + // Found non-immediate operand (instruction). + return false; + } + } + + return true; +} + +/// Helper method returning indices (distance from the begining of the basic block) +/// of outputs using the \p I (eliminates repetitions). Walks down the def-use +/// tree recursively. +/// +/// \param I Considered instruction. +/// \param Visited Set of visited instructions. +SetVector Canonicalizer::getOutputFootprint(Instruction *I, + SmallPtrSet &Visited) { + + // Vector containing indexes of outputs (no repetitions), + // which use I in the order of walking down the def-use tree. + SetVector Outputs; + + if (!Visited.count(I)) { + Visited.insert(I); + + if (isOutput(I)) { + // Gets output instruction's parent function. + Function *Func = I->getParent()->getParent(); + + // Finds and inserts the index of the output to the vector. + unsigned count = 0; + for (auto &B : *Func) { + for (auto &E : B) { + if (&E == I) { + Outputs.insert(count); + } + count++; + } + } + + // Returns to the used instruction. + return Outputs; + } + + for (auto U : I->users()) { + if (auto UI = dyn_cast(U)) { + // Vector for outputs which use UI. + SetVector OutputsUsingUI = getOutputFootprint(UI, Visited); + + // Insert the indexes of outputs using UI. + Outputs.insert(OutputsUsingUI.begin(), OutputsUsingUI.end()); + } + } + } + + // Return to the used instruction. + return Outputs; +} + +static RegisterPass X("canonicalizer", "Canonicalizer pass"); + +} + + + + Index: tools/llvm-canon/llvm-canon.cpp =================================================================== --- /dev/null +++ tools/llvm-canon/llvm-canon.cpp @@ -0,0 +1,83 @@ +//===-- llvm-canon.cpp - Canonicalizer ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the command line driver for the Canonicalizer. +/// +/// \see Canonicalizer +/// +//===----------------------------------------------------------------------===// + +#include "canonicalizer.h" +#include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/SourceMgr.h" +#include +#include +#include + +using namespace llvm; + +/// Reads a module from a file. +/// On error, messages are written to stderr and null is returned. +static std::unique_ptr readModule(LLVMContext &Context, StringRef Name) { + SMDiagnostic Diag; + std::unique_ptr M = parseIRFile(Name, Diag, Context); + + if (!M) { + Diag.print("llvm-canon", errs()); + } + + return M; +} + +cl::opt InputFilename("f", cl::desc("Specify input filename"), + cl::value_desc("filename"), cl::Required); + +cl::opt OutputFilename("o", cl::desc("Specify output filename"), + cl::value_desc("filename"), cl::Required); + +cl::opt RenameAll("rename-all", cl::desc("Rename all instructions (including unnamed)")); +cl::opt FoldPreoutputs("fold-all", cl::desc("Folds all regular instructions (including pre-outputs)")); + + +int main(int argc, char **argv) { + cl::ParseCommandLineOptions(argc, argv, " LLVM-Canon\n\n" + " This tool aims to transform LLVM Modules into canonical form by" + " reordering and renaming instructions while preserving the same" + " semantics. Making it easier to spot semantic differences while" + " diffing two modules which have undergone different passes.\n"); + + LLVMContext Context; + + std::unique_ptr InputModule = readModule(Context, InputFilename); + + if (!InputModule) { + return 1; + } + + Canonicalizer canon(RenameAll, FoldPreoutputs); + canon.runOnModule(*InputModule.get()); + + std::string Dump; + raw_string_ostream DumpStream(Dump); + InputModule.get()->print(DumpStream, false); + + std::ofstream FileStream; + FileStream.open(OutputFilename.c_str()); + + FileStream << DumpStream.str(); + + FileStream.close(); + + return 0; +} + + + +