Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -27,6 +27,7 @@
 class TargetTransformInfo;
 class Type;
 class Value;
+class VectorVariant;
 
 namespace Intrinsic {
 enum ID : unsigned;
@@ -123,6 +124,26 @@
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
+/// \brief Contains the names of the declared vector function variants.
+typedef std::vector<StringRef> DeclaredVariants;
+
+/// \brief Contains a mapping of a function to its vector function variants.
+typedef std::map<Function *, DeclaredVariants> FunctionVariants;
+
+/// \brief Get all function attributes that specify a vector variant.
+std::vector<Attribute> getVectorVariantAttributes(Function& F);
+
+/// \brief Determine the characteristic type of the vector function as
+/// specified according to the vector function ABI.
+Type* calcCharacteristicType(Function& F, VectorVariant& Variant);
+
+/// \brief Get all functions marked for vectorization in the module.
+void getFunctionsToVectorize(Module &M, FunctionVariants& FuncVars);
+
+/// \brief Returns a floating point or integer constant depending on Ty.
+template <typename T>
+Constant* getConstantValue(Type *Ty, LLVMContext &Context, T Val);
+
 } // llvm namespace
 
 #endif
Index: include/llvm/Analysis/VectorVariant.h
===================================================================
--- include/llvm/Analysis/VectorVariant.h
+++ include/llvm/Analysis/VectorVariant.h
@@ -0,0 +1,387 @@
+//===- llvm/Analysis/VectorVariant.h - Vector utilities ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This header defines the VectorVariant class and implements the encoding
+/// and decoding utilities for VectorVariant objects. Multiple VectorVariant
+/// objects can be created (masked, non-masked, etc.) and associated with the
+/// original scalar function. These objects are then used to clone new
+/// functions that can be vectorized. This class follows the standards defined
+/// in the vector function ABI.
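+///
+/// For example (illustration only): with this encoding, a mangled name such
+/// as "_ZGVbN4ua8v_foo" would describe an unmasked ('N'), XMM/SSE2 ('b')
+/// variant of a scalar function foo() with a vector length of 4, a uniform
+/// first parameter aligned to 8 bytes ("ua8"), and a vector second
+/// parameter ("v").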
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_VECTORVARIANT_H +#define LLVM_ANALYSIS_VECTORVARIANT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" +#include +#include +#include + +using namespace llvm; + +namespace llvm { + +#define STRIDE_KIND 's' +#define LINEAR_KIND 'l' +#define UNIFORM_KIND 'u' +#define VECTOR_KIND 'v' + +#define NOT_ALIGNED 1 + +#define POSITIVE 1 +#define NEGATIVE -1 + +class VectorKind { + +public: + VectorKind(char K, int S, int A = NOT_ALIGNED) { + + assert((S == notAValue() || K == STRIDE_KIND || K == LINEAR_KIND) && + "only linear vectors have strides"); + + assert((K != LINEAR_KIND || S != notAValue()) && + "linear vectors must have a stride"); + + assert((K != STRIDE_KIND || S != notAValue()) && + "variable stride vectors must have a stride"); + + assert((K != STRIDE_KIND || S >= 0) && + "variable stride position must be non-negative"); + + assert(A > 0 && "alignment must be positive"); + + Kind = K; + Stride = S; + Alignment = A; + } + + VectorKind(const VectorKind &Other) { + Kind = Other.Kind; + Stride = Other.Stride; + Alignment = Other.Alignment; + } + + /// \brief Is the stride for a linear parameter a uniform variable? (i.e., + /// the stride is stored in a variable but is uniform) + bool isVariableStride() { return Kind == STRIDE_KIND; } + + /// \brief Is the stride for a linear variable non-unit stride? + bool isNonUnitStride() { return Kind == LINEAR_KIND && Stride != 1; } + + /// \brief Is the stride for a linear variable unit stride? + bool isUnitStride() { return Kind == LINEAR_KIND && Stride == 1; } + + /// \brief Is this a linear parameter? + bool isLinear() { + return isVariableStride() || isNonUnitStride() || isUnitStride(); + } + + /// \brief Is this a uniform parameter? + bool isUniform() { return Kind == UNIFORM_KIND; } + + /// \brief Is this a vector parameter? + bool isVector() { return Kind == VECTOR_KIND; } + + /// \brief Is the parameter aligned? + bool isAligned() { return Alignment != NOT_ALIGNED; } + + /// \brief Get the stride associated with a linear parameter. + int getStride() { return Stride; } + + /// \brief Get the alignment associated with a linear parameter. + int getAlignment() { return Alignment; } + + /// \brief Represents a don't care value for strides of parameters other + /// than linear parameters. + static int notAValue() { return -1; } + + /// \brief Encode the parameter information into a mangled string + /// corresponding to the standards defined in the vector function ABI. + std::string encode() { + std::stringstream SST; + SST << Kind; + + if (isNonUnitStride()) { + if (Stride >= 0) + SST << Stride; + else + SST << "n" << -Stride; + } + + if (isVariableStride()) + SST << Stride; + + if (isAligned()) + SST << 'a' << Alignment; + + return SST.str(); + } + +private: + char Kind; // linear, uniform, vector + int Stride; + int Alignment; +}; + +class VectorVariant { +private: + // Targets defined in the vector function ABI. 
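+  // Each target processor maps onto one of the ISA classes below
+  // (XMM, YMM1, YMM2, ZMM); see targetProcessorISAClass().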
+ enum TargetProcessor { + Pentium4, // ISA extension = SSE2, ISA class = XMM + Pentium4SSE3, // ISA extension = SSE3, ISA class = XMM + Core2DuoSSSE3, // ISA extension = SSSE3, ISA class = XMM + Core2DuoSSE41, // ISA extension = SSE4_1, ISA class = XMM + CoreI7SSE42, // ISA extension = SSE4_2, ISA class = XMM + Core2ndGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core3rdGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core4thGenAVX, // ISA extension = AVX2, ISA class = YMM2 + Mic, // ISA extension = Xeon Phi, ISA class = MIC(ZMM) + FutureCpu22, // ISA extension = AVX512, ISA class = ZMM + FutureCpu23, // ISA extension = AVX512, ISA class = ZMM + }; + + // ISA classes defined in the vector function ABI. + enum ISAClass { + XMM, // (SSE2) + YMM1, // (AVX1) + YMM2, // (AVX2) + ZMM, // (MIC) + ISAClassesNum + }; + + ISAClass Isa; + bool Mask; + unsigned int Vlen; + std::vector Parameters; + + static std::string prefix() { return "_ZGV"; } + + /// \brief Determine the maximum vector register width based on the ISA classes + /// defined in the vector function ABI. + static unsigned int maximumSizeofISAClassVectorRegister(ISAClass I, Type *Ty); + +public: + VectorVariant(ISAClass I, bool M, unsigned int V, + const std::vector &P) + : Isa(I), Mask(M), Vlen(V), Parameters(P) { + if (Mask) { + // Masked variants will have an additional mask parameter + VectorKind VKind(VECTOR_KIND, VectorKind::notAValue()); + Parameters.push_back(VKind); + } + } + + VectorVariant(const VectorVariant &Other) + : Isa(Other.Isa), Mask(Other.Mask), Vlen(Other.Vlen), + Parameters(Other.Parameters) {} + + VectorVariant(StringRef FuncName); + + /// \brief Get the ISA corresponding to this vector variant. + ISAClass getISA() { return Isa; } + + /// \brief Is this a masked vector function variant? + bool isMasked() { return Mask; } + + /// \brief Get the vector length of the vector variant. + unsigned int getVlen() { return Vlen; } + + /// \brief Get the parameters of the vector variant. + std::vector &getParameters() { return Parameters; } + + /// \brief Build the mangled name for the vector variant. This function + /// builds a mangled name by including the encodings for the ISA class, + /// mask information, and all parameters. + std::string encode() { + + std::stringstream SST; + SST << prefix() << encodeISAClass(Isa) << encodeMask(Mask) << Vlen; + + std::vector::iterator It = Parameters.begin(); + std::vector::iterator End = Parameters.end(); + + if (isMasked()) + End--; // mask parameter is not encoded + + for (; It != End; ++It) + SST << (*It).encode(); + + SST << "_"; + + return SST.str(); + } + + /// \brief Generate a function name corresponding to a vector variant. + std::string generateFunctionName(StringRef ScalarFuncName) { + + static StringRef ManglingPrefix("_Z"); + std::string Name = encode(); + + if (ScalarFuncName.startswith(ManglingPrefix)) + return Name + ScalarFuncName.drop_front(ManglingPrefix.size()).str(); + else + return Name + ScalarFuncName.str(); + } + + /// \brief Some targets do not support particular types, so promote to a type + /// that is supported. + Type *promoteToSupportedType(Type *Ty) { + return promoteToSupportedType(Ty, *this); + } + + /// \brief Check to see if this is a vector variant based on the function + /// name. + static bool isVectorVariant(StringRef FuncName) { + return FuncName.startswith(prefix()); + } + + /// \brief Some targets do not support particular types, so promote to a type + /// that is supported. 
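+  /// For example, an i8 or i16 type is promoted to i32 when the variant
+  /// targets the ZMM ISA class.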
+ static Type *promoteToSupportedType(Type *Ty, VectorVariant &Variant) { + ISAClass I; + + I = Variant.getISA(); + // On ZMM promote char and short to int + if (I == ISAClass::ZMM && (Ty->isIntegerTy(8) || Ty->isIntegerTy(16))) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; + } + + /// \brief Get the ISA class corresponding to a particular target processor. + static ISAClass targetProcessorISAClass(TargetProcessor Target) { + + switch (Target) { + case Pentium4: + case Pentium4SSE3: + case Core2DuoSSSE3: + case Core2DuoSSE41: + case CoreI7SSE42: + return XMM; + case Core2ndGenAVX: + case Core3rdGenAVX: + return YMM1; + case Core4thGenAVX: + return YMM2; + case Mic: + case FutureCpu22: + case FutureCpu23: + return ZMM; + default: + llvm_unreachable("unsupported target processor"); + return XMM; + } + } + + /// \brief Get an ISA class string based on ISA class enum. + static std::string ISAClassToString(ISAClass IsaClass) { + switch (IsaClass) { + case XMM: + return "XMM"; + case YMM1: + return "YMM1"; + case YMM2: + return "YMM2"; + case ZMM: + return "ZMM"; + default: + assert(false && "unsupported ISA class"); + return "?"; + } + } + + /// \brief Get an ISA class enum based on ISA class string. + static ISAClass ISAClassFromString(std::string IsaClass) { + if (IsaClass == "XMM") + return XMM; + if (IsaClass == "YMM1") + return YMM1; + if (IsaClass == "YMM2") + return YMM2; + if (IsaClass == "ZMM") + return ZMM; + assert(false && "unsupported ISA class"); + return ISAClassesNum; + } + + /// \brief Encode the ISA class for the mangled variant name. + static char encodeISAClass(ISAClass IsaClass) { + + switch (IsaClass) { + case XMM: + return 'b'; + case YMM1: + return 'c'; + case YMM2: + return 'd'; + case ZMM: + return 'e'; + default: + break; + } + + assert(false && "unsupported ISA class"); + return '?'; + } + + /// \brief Decode the ISA class from the mangled variant name. + static ISAClass decodeISAClass(char IsaClass) { + + switch (IsaClass) { + case 'b': + return XMM; + case 'c': + return YMM1; + case 'd': + return YMM2; + case 'e': + return ZMM; + default: + llvm_unreachable("unsupported ISA class"); + return XMM; + } + } + + /// \brief Encode the mask information for the mangled variant name. + static char encodeMask(bool EncodeMask) { + + switch (EncodeMask) { + case true: + return 'M'; + case false: + return 'N'; + } + + llvm_unreachable("unsupported mask"); + } + + /// \brief Decode the mask information from the mangled variant name. + static bool decodeMask(char MaskToDecode) { + + switch (MaskToDecode) { + case 'M': + return true; + case 'N': + return false; + } + + llvm_unreachable("unsupported mask"); + } + + /// \brief Calculate the vector length for the vector variant. 
+ static unsigned int calcVlen(ISAClass I, Type *Ty); +}; + +} // llvm namespace + +#endif // LLVM_ANALYSIS_VECTORVARIANT_H Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -338,6 +338,7 @@ void initializeWinEHPreparePass(PassRegistry&); void initializeWriteBitcodePassPass(PassRegistry &); void initializeXRayInstrumentationPass(PassRegistry &); +void initializeVecClonePass(PassRegistry &); } #endif Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -46,6 +46,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/VecClone.h" #include "llvm/Transforms/Vectorize.h" #include "llvm/Support/Valgrind.h" #include @@ -198,6 +199,7 @@ (void) llvm::createMemDerefPrinter(); (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); + (void) llvm::createVecClonePass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); Index: include/llvm/Transforms/Utils/VecClone.h =================================================================== --- include/llvm/Transforms/Utils/VecClone.h +++ include/llvm/Transforms/Utils/VecClone.h @@ -0,0 +1,210 @@ +//===-------------- VecClone.h - Class definition -*- C++ -*---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This file defines the VecClone pass class. +/// +// ===--------------------------------------------------------------------=== // + +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#ifndef LLVM_TRANSFORMS_UTILS_VECCLONE_H +#define LLVM_TRANSFORMS_UTILS_VECCLONE_H + +namespace llvm { + +class ModulePass; + +/// \brief Represents the mapping of a vector parameter to its corresponding +/// vector to scalar type cast instruction. This done so that the scalar loop +/// inserted by this pass contains instructions that are in scalar form so that +/// the loop can later be vectorized. +struct ParmRef { + // Represents the parameter in one of two forms: + // 1) A vector alloca instruction if the parameter has not been registerized. + // 2) The parameter as the Value* passed in via the function call. + Value *VectorParm; + + // Represents the vector parameter cast from a vector type to scalar type. + Instruction *VectorParmCast; +}; + +class VecClone : public ModulePass { + + private: + + /// \brief Return true if the function has a complex type for the return + /// or parameters. + bool hasComplexType(Function *F); + + /// \brief Make a copy of the function if it is marked as SIMD. + Function* cloneFunction(Function &F, VectorVariant &V); + + /// \brief Take the entry basic block for the function as split off a second + /// basic block that will form the loop entry. + BasicBlock* splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock); + + /// \brief Take the loop entry basic block and split off a second basic + /// block into a new return basic block. 
+ BasicBlock* splitLoopIntoReturn(Function *Clone, BasicBlock *LoopBlock); + + /// \brief Generate a basic block to test the loop exit condition. + BasicBlock* createLoopExit(Function *Clone, BasicBlock *ReturnBlock); + + /// \brief Update the predecessors of the return basic block. + void updateReturnPredecessors(Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the backedge from the loop exit basic block to the loop + /// entry block. + PHINode* createPhiAndBackedgeForLoop(Function *Clone, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock, + int VL); + + /// \brief Generate vector alloca instructions for vector parameters and + /// change the parameter types to vector types. Expand the return value of + /// the function to a vector type. This function returns the instruction + /// corresponding to the expanded return and the instruction corresponding + /// to the mask. + Instruction* expandVectorParametersAndReturn( + Function *Clone, + VectorVariant &V, + Instruction **Mask, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Expand the function parameters to vector types. This function + /// returns the instruction corresponding to the mask. + Instruction* expandVectorParameters( + Function *Clone, + VectorVariant &V, + BasicBlock *EntryBlock, + std::vector& ParmMap); + + /// \brief Expand the function's return value to a vector type. + Instruction* expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Update the old parameter references to with the new vector + /// references. + void updateScalarMemRefsWithVector( + Function *Clone, + Function &F, + BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, + PHINode *Phi, + std::vector& ParmMap); + + /// \brief Update the values of linear parameters by adding the stride + /// before the use. + void updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi); + + /// \brief Update the instructions in the return basic block to return a + /// vector temp. + void updateReturnBlockInstructions(Function *Clone, BasicBlock *ReturnBlock, + Instruction *VecReturnAlloca); + + /// \brief Create a separate basic block to mark the begin and end of the + /// SIMD loop formed from the vector function. Essentially, this function + /// transfers the information from the SIMD function keywords and creates + /// new loop pragmas so that parameter information can be transferred to + /// the loop. + void insertDirectiveIntrinsics(Module& M, Function *Clone, Function &F, + VectorVariant &V, + BasicBlock *EntryBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the basic block indicating the begin of the SIMD loop. + void insertBeginRegion(Module& M, Function *Clone, Function &F, + VectorVariant &V, BasicBlock *EntryBlock); + + /// \brief Create the basic block indicating the end of the SIMD loop. + void insertEndRegion(Module& M, Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create a new vector alloca instruction for the return vector and + /// bitcast to the appropriate element type. + Instruction* createExpandedReturn(Function *F, BasicBlock *BB, + VectorType *ReturnType); + + /// \brief Return the position of the parameter in the function's parameter + /// list. 
int getParmIndexInFunction(Function *F, Value *Parm);
+
+  /// \brief Check to see if the function is simple enough that a loop does
+  /// not need to be inserted into the function.
+  bool isSimpleFunction(Function *Clone, VectorVariant &V,
+                        ReturnInst *Return);
+
+  /// \brief Inserts the if/else split and mask condition for masked SIMD
+  /// functions.
+  void insertSplitForMaskedVariant(Function *Clone, BasicBlock *LoopBlock,
+                                   BasicBlock *LoopExitBlock,
+                                   Instruction *Mask, PHINode *Phi);
+
+  /// \brief Utility function to insert instructions with other instructions
+  /// of the same kind.
+  void insertInstruction(Instruction *Inst, BasicBlock *BB);
+
+  /// \brief Utility function that generates instructions that calculate the
+  /// stride for a linear parameter.
+  Instruction* generateStrideForParameter(Function *Clone, Argument *Arg,
+                                          Instruction *ParmUser, int Stride,
+                                          PHINode *Phi);
+
+  /// \brief Utility function that returns true if Inst is a store of a vector
+  /// or linear parameter.
+  bool isVectorOrLinearParamStore(Function *Clone,
+                                  std::vector<VectorKind> &ParmKinds,
+                                  Instruction *Inst);
+
+  /// \brief Removes the original scalar alloca instructions that correspond
+  /// to a vector parameter before widening.
+  void removeScalarAllocasForVectorParams(
+      std::vector<ParmRef *> &VectorParmMap);
+
+  /// \brief Adds metadata to the conditional branch of the simd loop latch to
+  /// prevent loop unrolling.
+  void disableLoopUnrolling(BasicBlock *Latch);
+
+  /// \brief Check to see that the type of the gep used for a load instruction
+  /// is compatible with the type needed as the result of the load. Basically,
+  /// check the validity of the LLVM IR to make sure that proper pointer
+  /// dereferencing is done.
+  bool typesAreCompatibleForLoad(Type *GepType, Type *LoadType);
+
+  bool runOnModule(Module &M) override;
+
+  public:
+
+  static char ID;
+  VecClone();
+  void print(raw_ostream &OS, const Module * = nullptr) const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+}; // end pass class
+
+ModulePass *createVecClonePass();
+
+} // end llvm namespace
+
+#endif // LLVM_TRANSFORMS_UTILS_VECCLONE_H
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -79,6 +79,7 @@
   ScopedNoAliasAA.cpp
   ValueTracking.cpp
   VectorUtils.cpp
+  VectorVariant.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis
Index: lib/Analysis/VectorUtils.cpp
===================================================================
--- lib/Analysis/VectorUtils.cpp
+++ lib/Analysis/VectorUtils.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/VectorVariant.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Value.h"
@@ -488,3 +489,106 @@
   return Inst;
 }
+
+std::vector<Attribute> llvm::getVectorVariantAttributes(Function& F) {
+  std::vector<Attribute> RetVal;
+  AttributeSet Attributes = F.getAttributes().getFnAttributes();
+  AttributeSet::iterator ItA = Attributes.begin(0);
+  AttributeSet::iterator EndA = Attributes.end(0);
+  for (; ItA != EndA; ++ItA) {
+    if (!ItA->isStringAttribute())
+      continue;
+    StringRef AttributeKind = ItA->getKindAsString();
+    if (VectorVariant::isVectorVariant(AttributeKind))
+      RetVal.push_back(*ItA);
+  }
+  return RetVal;
+}
+
+Type* llvm::calcCharacteristicType(Function& F, VectorVariant& Variant) {
+  Type* ReturnType = F.getReturnType();
+  Type* CharacteristicDataType = NULL;
+
+  if (!ReturnType->isVoidTy())
+    CharacteristicDataType = ReturnType;
+
+  if (!CharacteristicDataType) {
+
+    std::vector<VectorKind>& ParmKinds = Variant.getParameters();
+    const Function::ArgumentListType& Args = F.getArgumentList();
+    Function::ArgumentListType::const_iterator ArgIt = Args.begin();
+    Function::ArgumentListType::const_iterator ArgEnd = Args.end();
+    std::vector<VectorKind>::iterator VKIt = ParmKinds.begin();
+
+    for (; ArgIt != ArgEnd; ++ArgIt, ++VKIt) {
+      if (VKIt->isVector()) {
+        CharacteristicDataType = (*ArgIt).getType();
+        break;
+      }
+    }
+  }
+
+  // TODO except Clang's ComplexType
+  if (!CharacteristicDataType || CharacteristicDataType->isStructTy()) {
+    CharacteristicDataType = Type::getInt32Ty(F.getContext());
+  }
+
+  // Promote char/short types to int for Xeon Phi.
+  CharacteristicDataType =
+      VectorVariant::promoteToSupportedType(CharacteristicDataType, Variant);
+
+  if (CharacteristicDataType->isPointerTy()) {
+    // For such cases as 'int* foo(int x)', where x is a non-vector type, the
+    // characteristic type at this point will be i32*. If we use the DataLayout
+    // to query the supported pointer size, then a promotion to i64* is
+    // incorrect because the mask element type will mismatch the element type
+    // of the characteristic type.
+    PointerType *PointerTy = cast<PointerType>(CharacteristicDataType);
+    CharacteristicDataType = PointerTy->getElementType();
+  }
+
+  return CharacteristicDataType;
+}
+
+void llvm::getFunctionsToVectorize(llvm::Module &M,
+                                   FunctionVariants& FuncVars) {
+  for (auto It = M.begin(), End = M.end(); It != End; ++It) {
+    Function& F = *It;
+    auto VariantAttributes = getVectorVariantAttributes(F);
+    if (VariantAttributes.empty())
+      continue;
+    FuncVars[&F] = DeclaredVariants();
+    DeclaredVariants& DeclaredFuncVariants = FuncVars[&F];
+    for (auto Attr : VariantAttributes)
+      DeclaredFuncVariants.push_back(Attr.getKindAsString());
+  }
+}
+
+template Constant *
+llvm::getConstantValue(Type *Ty, LLVMContext &Context, int Val);
+template Constant *
+llvm::getConstantValue(Type *Ty, LLVMContext &Context, float Val);
+template Constant *
+llvm::getConstantValue(Type *Ty, LLVMContext &Context, double Val);
+
+template <typename T>
+Constant *llvm::getConstantValue(Type *Ty, LLVMContext &Context, T Val) {
+  Constant *ConstVal = nullptr;
+
+  if (Ty->isIntegerTy(8))
+    ConstVal = ConstantInt::get(Type::getInt8Ty(Context), Val);
+  else if (Ty->isIntegerTy(16))
+    ConstVal = ConstantInt::get(Type::getInt16Ty(Context), Val);
+  else if (Ty->isIntegerTy(32))
+    ConstVal = ConstantInt::get(Type::getInt32Ty(Context), Val);
+  else if (Ty->isIntegerTy(64))
+    ConstVal = ConstantInt::get(Type::getInt64Ty(Context), Val);
+  else if (Ty->isFloatTy())
+    ConstVal = ConstantFP::get(Type::getFloatTy(Context), Val);
+  else if (Ty->isDoubleTy())
+    ConstVal = ConstantFP::get(Type::getDoubleTy(Context), Val);
+
+  assert(ConstVal && "Could not generate constant for type");
+
+  return ConstVal;
+}
Index: lib/Analysis/VectorVariant.cpp
===================================================================
--- lib/Analysis/VectorVariant.cpp
+++ lib/Analysis/VectorVariant.cpp
@@ -0,0 +1,149 @@
+//===--------- VectorVariant.cpp - Vector function ABI -*- C++ -*----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VectorVariant class and corresponding utilities. +/// VectorVariant objects are associated with a scalar function and are used +/// to generate new functions that can be vectorized. VectorVariants are +/// determined by inspecting the function attributes associated with the scalar +/// function. When a mangled function name is found in the attributes (indicated +/// as "_ZGV"), a VectorVariant object is created. The class and utilities +/// in this file follow the standards defined in the vector function ABI. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/Type.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +/// \brief Generate a vector variant by decoding the mangled string for the +/// variant contained in the original scalar function's attributes. For +/// example: "_ZGVxN4". The name mangling is defined in the vector function +/// ABI. Based on this string, the parameter kinds (uniform, linear, vector), +/// vector length, parameter alignment, and masking are determined. +VectorVariant::VectorVariant(StringRef FuncName) { + + assert(isVectorVariant(FuncName) && "invalid vector variant format"); + + std::stringstream SST(FuncName.drop_front(prefix().size())); + + // mandatory annotations + char EncodedISA; + SST.get(EncodedISA); + Isa = decodeISAClass(EncodedISA); + + char EncodedMask; + SST.get(EncodedMask); + Mask = decodeMask(EncodedMask); + SST >> Vlen; + + // optional parameter annotations + while (SST.peek() != '_') { + + char Kind; + int Stride = VectorKind::notAValue(); + int StrideSign = POSITIVE; + int Alignment = NOT_ALIGNED; + + // Get parameter kind + SST.get(Kind); + + // Default stride for linear is 1. If the stride for a parameter is 1, + // then the front-end will not encode it and we will not have set the + // correct stride below. + if (Kind == LINEAR_KIND) + Stride = 1; + + // Handle optional stride + if (SST.peek() == 'n') { + // Stride is negative + SST.ignore(1); + StrideSign = NEGATIVE; + } + + if (std::isdigit(SST.peek())) { + // Extract constant stride + SST >> Stride; + assert((Kind != STRIDE_KIND || Stride >= 0) && + "variable stride argument index cannot be negative"); + } + + Stride *= StrideSign; + // Handle optional alignment + if (SST.peek() == 'a') { + SST.ignore(1); + SST >> Alignment; + } + + VectorKind VecKind(Kind, Stride, Alignment); + Parameters.push_back(VecKind); + } + + if (Mask) { + // Masked variants will have an additional mask parameter + VectorKind VecKind(VECTOR_KIND, VectorKind::notAValue()); + Parameters.push_back(VecKind); + } +} + +/// \brief Determine the vector variant's vector length based on the +/// characteristic data type defined in the vector function ABI and target +/// vector register width. 
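+/// For example, a 32-bit float characteristic type on the YMM2 (AVX2) ISA
+/// class uses a 256-bit vector register, giving a vector length of
+/// 256 / 32 = 8.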
+unsigned int VectorVariant::calcVlen(ISAClass I, + Type* CharacteristicDataType) { + assert(CharacteristicDataType && + CharacteristicDataType->getPrimitiveSizeInBits() != 0 && + "expected characteristic data type to have a primitive size in bits"); + + unsigned int VectorRegisterSize = + maximumSizeofISAClassVectorRegister(I, CharacteristicDataType); + + return VectorRegisterSize / CharacteristicDataType->getPrimitiveSizeInBits(); +} + +/// \brief Determine the maximum vector register width based on the ISA classes +/// defined in the vector function ABI. +unsigned int VectorVariant::maximumSizeofISAClassVectorRegister(ISAClass I, + Type* Ty) +{ + assert((Ty->isIntegerTy() || Ty->isFloatTy() || Ty->isDoubleTy() || + Ty->isPointerTy()) && "unsupported type"); + + unsigned int VectorRegisterSize = 0; + + switch (I) { + case XMM: + VectorRegisterSize = 128; + break; + case YMM1: + if (Ty->isIntegerTy() || Ty->isPointerTy()) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case YMM2: + if (Ty->isIntegerTy(8)) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case ZMM: + VectorRegisterSize = 512; + break; + default: + llvm_unreachable("unknown isa class"); + return 0; + } + + assert(VectorRegisterSize != 0 && "unsupported ISA/type combination"); + return VectorRegisterSize; +} Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -38,6 +38,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Utils/VecClone.h" using namespace llvm; @@ -102,6 +103,10 @@ "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); +static cl::opt RunVecClone("enable-vec-clone", + cl::init(true), cl::Hidden, + cl::desc("Run Vector Function Cloning")); + static cl::opt EnableNonLTOGlobalsModRef( "enable-non-lto-gmr", cl::init(true), cl::Hidden, cl::desc( @@ -376,6 +381,10 @@ MPM.add(createBarrierNoopPass()); addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); + + if (RunVecClone) + MPM.add(createVecClonePass()); + return; } @@ -513,6 +522,9 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false)); + if (RunVecClone) + MPM.add(createVecClonePass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -44,6 +44,7 @@ UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + VecClone.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms Index: lib/Transforms/Utils/VecClone.cpp =================================================================== --- lib/Transforms/Utils/VecClone.cpp +++ lib/Transforms/Utils/VecClone.cpp @@ -0,0 +1,1553 @@ +//=------- VecClone.cpp - Vector function to loop transform -*- C++ -*-------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This pass inserts the body of a vector function inside a vector length +/// trip count scalar loop for functions that are declared SIMD. The pass +/// follows the Intel vector ABI requirements for name mangling encodings. +/// +// ===--------------------------------------------------------------------=== // + +// This pass is flexible enough to deal with two forms of LLVM IR, namely the +// difference between when Mem2Reg has been run and when Mem2Reg has not run. +// i.e., the pass needs to be able to make the loop transformation at all +// opt levels. +// +// When Mem2Reg has run: +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %add = add nsw i32 %x, %i +// ret i32 %add +// } +// +// When Mem2Reg has not run: +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %i.addr = alloca i32, align 4 +// %x.addr = alloca i32, align 4 +// store i32 %i, i32* %i.addr, align 4 +// store i32 %x, i32* %x.addr, align 4 +// %0 = load i32, i32* %x.addr, align 4 +// %1 = load i32, i32* %i.addr, align 4 +// %add = add nsw i32 %0, %1 +// ret i32 %add +// } +// +// When Mem2Reg has not been run (i.e., parameters have not been registerized), +// we end up with an alloca, store to alloca, and load from alloca sequence. +// When parameters have already been registerized, users of the parameter use +// the parameter directly and not through a load of a new SSA temp. For either +// case, this pass will expand the vector parameters/return to vector types, +// alloca new space for them on the stack, and do an initial store to the +// alloca. Linear and uniform parameters will be used directly, instead of +// through a load instruction. + +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/Transforms/Utils/VecClone.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include +#include + +#define SV_NAME "vec-clone" +#define DEBUG_TYPE "VecClone" + +using namespace llvm; + +VecClone::VecClone() : ModulePass(ID) { } + +void VecClone::getAnalysisUsage(AnalysisUsage &AU) const { + // Placeholder for any new pass dependencies. For now, none are needed. +} + +void VecClone::insertInstruction(Instruction *Inst, BasicBlock *BB) +{ + // This function inserts instructions in a way that groups like instructions + // together for debuggability/readability purposes. This was designed to make + // the entry basic block easier to read since this pass creates/modifies + // alloca, store, and bitcast instructions for each vector parameter and + // return. Thus, this function ensures all allocas are grouped together, all + // stores are grouped together, and so on. If the type of instruction passed + // in does not exist in the basic block, then it is added to the end of the + // basic block, just before the terminator instruction. 
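+  // Scan the block in reverse for the last instruction with the same opcode
+  // as Inst; if none is found, the anchor remains the block terminator and
+  // Inst is inserted before it.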
+ + BasicBlock::reverse_iterator BBIt = BB->rbegin(); + BasicBlock::reverse_iterator BBEnd = BB->rend(); + BasicBlock::iterator AnchorInstIt = BB->end(); + AnchorInstIt--; + Instruction *Anchor = &*AnchorInstIt; + + for (; BBIt != BBEnd; ++BBIt) { + if (Inst->getOpcode() == (&*BBIt)->getOpcode()) { + Anchor = &*BBIt; + break; + } + } + + if (isa(Anchor)) { + Inst->insertBefore(Anchor); + } else { + Inst->insertAfter(Anchor); + } +} + +bool VecClone::hasComplexType(Function *F) +{ + Function::ArgumentListType &ArgList = F->getArgumentList(); + Function::ArgumentListType::iterator ArgListIt = ArgList.begin(); + Function::ArgumentListType::iterator ArgListEnd = ArgList.end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + // Complex types for parameters/return come in as vector. + if (ArgListIt->getType()->isVectorTy()) { + return true; + } + } + + return false; +} + +Function* VecClone::cloneFunction(Function &F, VectorVariant &V) +{ + + DEBUG(dbgs() << "Cloning Function: " << F.getName() << "\n"); + DEBUG(F.dump()); + + FunctionType* OrigFunctionType = F.getFunctionType(); + Type *ReturnType = F.getReturnType(); + Type *CharacteristicType = calcCharacteristicType(F, V); + + // Expand return type to vector. + if (!ReturnType->isVoidTy()) + ReturnType = VectorType::get(ReturnType, V.getVlen()); + + std::vector ParmKinds = V.getParameters(); + SmallVector ParmTypes; + FunctionType::param_iterator ParmIt = OrigFunctionType->param_begin(); + FunctionType::param_iterator ParmEnd = OrigFunctionType->param_end(); + std::vector::iterator VKIt = ParmKinds.begin(); + for (; ParmIt != ParmEnd; ++ParmIt, ++VKIt) { + if (VKIt->isVector()) + ParmTypes.push_back(VectorType::get((*ParmIt)->getScalarType(), + V.getVlen())); + else + ParmTypes.push_back(*ParmIt); + } + + if (V.isMasked()) { + VectorType *MaskType = VectorType::get(CharacteristicType, V.getVlen()); + ParmTypes.push_back(MaskType); + } + + FunctionType* CloneFuncType = FunctionType::get(ReturnType, ParmTypes, + false); + + std::string VariantName = V.generateFunctionName(F.getName()); + Function* Clone = Function::Create(CloneFuncType, F.getLinkage(), + VariantName, F.getParent()); + + // Remove vector variant attributes from the original function. They are + // not needed for the cloned function and it prevents any attempts at + // trying to clone the function again in case the pass is called more than + // once. + AttrBuilder AB; + for (auto Attr : getVectorVariantAttributes(F)) { + AB.addAttribute(Attr); + } + AttributeSet AttrsToRemove = AttributeSet::get(F.getContext(), + AttributeSet::FunctionIndex, + AB); + + F.removeAttributes(AttributeSet::FunctionIndex, AttrsToRemove); + + // Copy all the attributes from the scalar function to its vector version + // except for the vector variant attributes. + Clone->copyAttributesFrom(&F); + + // Remove incompatible argument attributes (applied to the scalar argument, + // does not apply to its vector counterpart). 
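+  // Argument attribute indices are biased by one (index 0 refers to the
+  // return value), which is why Idx starts at 1 below.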
+ Function::arg_iterator ArgIt = Clone->arg_begin(); + Function::arg_iterator ArgEnd = Clone->arg_end(); + for (uint64_t Idx = 1; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + Type* ArgType = (*ArgIt).getType(); + AB = AttributeFuncs::typeIncompatible(ArgType); + AttributeSet AS = AttributeSet::get(Clone->getContext(), Idx, AB); + (*ArgIt).removeAttr(AS); + } + + ValueToValueMapTy Vmap; + ArgIt = F.arg_begin(); + ArgEnd = F.arg_end(); + Function::arg_iterator NewArgIt = Clone->arg_begin(); + for (; ArgIt != ArgEnd; ++ArgIt, ++NewArgIt) { + NewArgIt->setName(ArgIt->getName()); + Vmap[&*ArgIt] = &*NewArgIt; + } + + if (V.isMasked()) { + Argument &MaskArg = *NewArgIt; + MaskArg.setName("mask"); + } + + SmallVector Returns; + CloneFunctionInto(Clone, &F, Vmap, true, Returns); + + DEBUG(dbgs() << "After Cloning and Parameter/Return Expansion\n"); + DEBUG(Clone->dump()); + + return Clone; +} + +bool VecClone::isVectorOrLinearParamStore( + Function *Clone, + std::vector &ParmKinds, + Instruction *Inst) +{ + if (StoreInst *Store = dyn_cast(Inst)) { + Value *Op0 = Store->getOperand(0); + Function::ArgumentListType &ArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator ArgListIt = ArgList.begin(); + Function::ArgumentListType::iterator ArgListEnd = ArgList.end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + unsigned ParmIdx = ArgListIt->getArgNo(); + if (&*ArgListIt == Op0 && + (ParmKinds[ParmIdx].isVector() || ParmKinds[ParmIdx].isLinear())) { + return true; + } + } + } + + return false; +} + +BasicBlock* VecClone::splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock) +{ + + // EntryInsts contains all instructions that need to stay in the entry basic + // block. These instructions include allocas and stores involving vector and + // linear parameters to alloca. Linear parameter stores to alloca are kept in + // the entry block because there will be a load from this alloca in the loop + // for which we will apply the stride. Instructions involving uniform + // parameter stores to alloca should be sunk into the loop to maintain + // uniform behavior. All instructions involving private variables are also + // sunk into the loop. + + SmallVector EntryInsts; + std::vector ParmKinds = V.getParameters(); + BasicBlock::iterator BBIt = EntryBlock->begin(); + BasicBlock::iterator BBEnd = EntryBlock->end(); + + for (; BBIt != BBEnd; ++BBIt) { + if (isa(BBIt) || + isVectorOrLinearParamStore(Clone, ParmKinds, &*BBIt)) { + // If this is a store of a vector parameter, keep it in the entry block + // because it will be modified with the vector alloca reference. Since the + // parameter has already been expanded, this becomes a vector store (i.e., + // packing instruction) that we do not want to appear in the scalar loop. + // It is correct to leave linear parameter stores in the entry or move + // them to the scalar loop, but leaving them in the entry block prevents + // an additional store inside the loop. Uniform parameter stores must be + // moved to the loop body to behave as uniform. Consider the following: + // + // __declspec(vector(uniform(x))) + // int foo(int a, int x) { + // x++; + // return (a + x); + // } + // + // Assume x = 1 for the call to foo. This implies x = 2 for the vector + // add. e.g., a[0:VL-1] + <2, 2, 2, 2>. If the initial store of x to the + // stack is done in the entry block outside of the loop, then x will be + // incremented by one each time within the loop because the increment of + // x will reside in the loop. 
Therefore, if the store of x is sunk into + // the loop, the initial value of 1 will always be stored to a temp + // before the increment, resulting in the value of 2 always being computed + // in the scalar loop. + EntryInsts.push_back(&*BBIt); + } + } + + BasicBlock *LoopBlock = EntryBlock->splitBasicBlock(EntryBlock->begin(), + "simd.loop"); + + for (auto *Inst : EntryInsts) { + Inst->removeFromParent(); + Inst->insertBefore(EntryBlock->getTerminator()); + } + + DEBUG(dbgs() << "After Entry Block Split\n"); + DEBUG(Clone->dump()); + + return LoopBlock; +} + +BasicBlock* VecClone::splitLoopIntoReturn(Function *Clone, + BasicBlock *LoopBlock) +{ + + // Determine the basic block with the return. For simple cases, the 'ret' + // instruction will be part of the entry block. In this case, separate the + // 'ret' into a new basic block because we don't want this as part of the + // loop. For more complex cases, the 'ret' and corresponding instructions + // (i.e., load from auto variable) will already be in a separate basic block, + // so no need to split here. + + Instruction *SplitPt = LoopBlock->getTerminator(); + + if (ReturnInst *Return = dyn_cast(SplitPt)) { + + // If the return is from a preceeding load, make sure the load is also put + // in the return block. This is the old scalar load that will end up getting + // replaced with the vector return and will get cleaned up later. + + // Make sure this is not a void function before getting the return + // operand. + if (!Clone->getReturnType()->isVoidTy()) { + Value *RetOp = Return->getOperand(0); + Value::use_iterator UseIt = RetOp->use_begin(); + Value::use_iterator UseEnd = RetOp->use_end(); + + for (; UseIt != UseEnd; ++UseIt) { + LoadInst *RetLoad = dyn_cast(*UseIt); + if (RetLoad) { + SplitPt = RetLoad; + } + } + } + } + + Function::iterator ReturnBlockIt = Clone->end(); + BasicBlock *ReturnBlock; + if (dyn_cast(SplitPt) || dyn_cast(SplitPt)) { + ReturnBlock = LoopBlock->splitBasicBlock(SplitPt, "return"); + } else { + ReturnBlockIt = Clone->end(); + ReturnBlockIt--; + ReturnBlock = &*ReturnBlockIt; + } + + return ReturnBlock; +} + +void VecClone::updateReturnPredecessors(Function *Clone, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock) +{ + // Update the branches of the ReturnBlock predecessors to point back to + // LoopBlock if the index is less than VL. + + // First, collect the basic blocks to be updated since we don't want to update + // the CFG while iterating through it. + SmallVector BranchesToUpdate; + Function::iterator FI = Clone->begin(); + Function::iterator FE = Clone->end(); + for (; FI != FE; ++FI) { + + BasicBlock::iterator BBI = FI->begin(); + BasicBlock::iterator BBE = FI->end(); + + for (; BBI != BBE; ++BBI) { + + BranchInst *Branch = dyn_cast(BBI); + + if (Branch) { + unsigned NumSucc = Branch->getNumSuccessors(); + + for (unsigned I = 0; I < NumSucc; ++I) { + if (Branch->getSuccessor(I) == ReturnBlock) { + BranchesToUpdate.push_back(Branch); + } + } + } + } + } + + // Now, do the actual update. The code below handles both conditional and + // unconditional branches because we loop through all successors of the + // branch to see if any of them point to the ReturnBlock. 
+ for (unsigned I = 0; I < BranchesToUpdate.size(); ++I) { + unsigned int NumOps = BranchesToUpdate[I]->getNumSuccessors(); + for (unsigned Idx = 0; Idx < NumOps; ++Idx) { + BasicBlock *Successor = BranchesToUpdate[I]->getSuccessor(Idx); + if (Successor == ReturnBlock) { + BranchesToUpdate[I]->setOperand(Idx, LoopExitBlock); + } + } + } +} + +BasicBlock* VecClone::createLoopExit(Function *Clone, BasicBlock *ReturnBlock) +{ + BasicBlock *LoopExitBlock = BasicBlock::Create(Clone->getContext(), + "simd.loop.exit", + Clone, ReturnBlock); + + updateReturnPredecessors(Clone, LoopExitBlock, ReturnBlock); + return LoopExitBlock; +} + +PHINode* VecClone::createPhiAndBackedgeForLoop( + Function *Clone, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock, + int VectorLength) +{ + + // Create the phi node for the top of the loop block and add the back + // edge to the loop from the loop exit. + + PHINode *Phi = PHINode::Create(Type::getInt32Ty(Clone->getContext()), 2, + "index", &*LoopBlock->getFirstInsertionPt()); + + Constant *Inc = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 1); + Constant *IndInit = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), + 0); + + Instruction *Induction = BinaryOperator::CreateNUWAdd(Phi, Inc, "indvar", + LoopExitBlock); + + Constant *VL = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), + VectorLength); + + Instruction *VLCmp = new ICmpInst(*LoopExitBlock, CmpInst::ICMP_ULT, + Induction, VL, "vl.cond"); + + BranchInst::Create(LoopBlock, ReturnBlock, VLCmp, LoopExitBlock); + + Phi->addIncoming(IndInit, EntryBlock); + Phi->addIncoming(Induction, LoopExitBlock); + + DEBUG(dbgs() << "After Loop Insertion\n"); + DEBUG(Clone->dump()); + + return Phi; +} + +Instruction* VecClone::expandVectorParameters( + Function *Clone, + VectorVariant &V, + BasicBlock *EntryBlock, + std::vector& VectorParmMap) +{ + // For vector parameters, expand the existing alloca to a vector. Then, + // bitcast the vector and store this instruction in a map. The map is later + // used to insert the new instructions and to replace the old scalar memory + // references. If there are no parameters, then the function simply does not + // perform any expansion since we iterate over the function's arg list. + + Instruction *Mask = nullptr; + SmallVector StoresToInsert; + + Function::ArgumentListType &CloneArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator ArgIt = CloneArgList.begin(); + Function::ArgumentListType::iterator ArgEnd = CloneArgList.end(); + + for (; ArgIt != ArgEnd; ++ArgIt) { + + User::user_iterator UserIt = ArgIt->user_begin(); + User::user_iterator UserEnd = ArgIt->user_end(); + + VectorType *VecType = dyn_cast(ArgIt->getType()); + + if (VecType) { + + // Create a new vector alloca and bitcast to a pointer to the + // element type. The following is an example of what the cast + // should look like: + // + // %veccast = bitcast <2 x i32>* %vec_a.addr to i32* + // + // geps using the bitcast will appear in a scalar form instead + // of casting to an array or using vector. For example, + // + // %vecgep1 = getelementptr i32, i32* %veccast, i32 %index + // + // instead of: + // + // getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 1 + // + // We do this to put the geps in a more scalar form. + + Twine VarName = "vec." 
+ ArgIt->getName(); + AllocaInst *VecAlloca = new AllocaInst(VecType, VarName); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = + PointerType::get(VecType->getElementType(), + VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecParmCast = nullptr; + if (ArgIt->getNumUses() == 0 && V.isMasked()) { + Mask = new BitCastInst(VecAlloca, ElemTypePtr, "mask.cast"); + } else { + Twine CastName = "vec." + ArgIt->getName() + ".cast"; + VecParmCast = new BitCastInst(VecAlloca, ElemTypePtr, CastName); + insertInstruction(VecParmCast, EntryBlock); + } + + for (; UserIt != UserEnd; ++UserIt) { + + StoreInst *StoreUser = dyn_cast(*UserIt); + AllocaInst *Alloca = NULL; + ParmRef *PRef = new ParmRef(); + + if (StoreUser) { + + // For non-mask parameters, find the initial store of the parameter + // to an alloca instruction. Map this alloca to the vector bitcast + // created above so that we can update the old scalar references. + + Alloca = dyn_cast(UserIt->getOperand(1)); + PRef->VectorParm = Alloca; + } else { + // Since Mem2Reg has run, there is no existing scalar store for + // the parameter, but we must still pack (store) the expanded vector + // parameter to a new vector alloca. This store is created here and + // put in a container for later insertion. We cannot insert it here + // since this will be a new user of the parameter and we are still + // iterating over the original users of the parameter. This will + // invalidate the iterator. We also map the parameter directly to the + // vector bitcast so that we can later update any users of the + // parameter. + + Value *ArgValue = dyn_cast(ArgIt); + StoreInst *Store = new StoreInst(ArgValue, VecAlloca); + StoresToInsert.push_back(Store); + PRef->VectorParm = ArgValue; + } + + PRef->VectorParmCast = VecParmCast; + VectorParmMap.push_back(PRef); + } + } + } + + // Insert any necessary vector parameter stores here. This is needed for when + // there were no existing scalar stores that we can update to vector stores + // for the parameter. This is needed when Mem2Reg has registerized parameters. + // The stores are inserted after the allocas in the entry block. + for (auto *Inst : StoresToInsert) { + insertInstruction(Inst, EntryBlock); + } + + return Mask; +} + +Instruction* VecClone::createExpandedReturn(Function *Clone, + BasicBlock *EntryBlock, + VectorType *ReturnType) +{ + // Expand the return temp to a vector. + + Twine VarName = "vec.retval"; + VectorType *AllocaType = dyn_cast(Clone->getReturnType()); + + AllocaInst *VecAlloca = new AllocaInst(AllocaType, VarName); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = + PointerType::get(ReturnType->getElementType(), + VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecCast = new BitCastInst(VecAlloca, ElemTypePtr, "ret.cast"); + insertInstruction(VecCast, EntryBlock); + + return VecCast; +} + +Instruction* VecClone::expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& VectorParmMap) +{ + // Determine how the return is currently handled, since this will determine + // if a new vector alloca is required for it. For simple functions, an alloca + // may not have been created for the return value. The function may just + // simply return a value defined by some operation that now exists within the + // loop. If an alloca was generated already, then the return block will load + // from it and then return. 
Thus, we look for a return resulting from a load + // in the return block. If found, we have already expanded all alloca + // instructions to vector types and the old scalar references have already + // been replaced with them. In this case, we only need to pack the results + // from the vector alloca into a temp and return the temp. If a vector alloca + // was not generated for the return, we need to add one for it because we have + // a scalar reference in the loop that needs to be replaced. After creating + // the new vector alloca, replace the reference to it in the loop and then + // pack the results into a temp and return it. + // + // Example 1: // alloca not generated in entry block + // + // loop: + // ... // some set of instructions + // %add1 = add nsw i32 %1, %2 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // ret i32 %add1 + // + // + // Example 2: + // + // loop: + // ... // some set of instructions + // %vecgep1 = getelementptr <2 x i32>* %vec_ret, i32 0, i32 %index + // store i32 %add2, i32* %vecgep1 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // %7 = load i32, i32*, %retval // the original scalar alloca + // ret i32 %7 + // + + ReturnInst *FuncReturn = dyn_cast(ReturnBlock->getTerminator()); + assert(FuncReturn && "Expected ret instruction to terminate the return\ + basic block"); + + LoadInst *LoadFromAlloca = dyn_cast(FuncReturn->getOperand(0)); + + // We need to generate a vector alloca for the return vector. + // Two cases exist, here: + // + // 1) For simple functions, the return is a temp defined within the + // loop body and the temp is not loaded from an alloca, or the return is + // a constant. (obviously, also not loaded from an alloca) + // + // 2) The return temp traces back to an alloca. + // + // For both cases, generate a vector alloca so that we can later load from it + // and return the vector temp from the function. The alloca is used to load + // and store from so that the scalar loop contains load/store/gep + // instructions. This enables AVR construction to remain straightforward. + // E.g., we don't need to worry about figuring out how to represent + // insert/extract when building AVR nodes. This keeps consistent with how ICC + // is operating. + // + // Additionally, for case 1 we must generate a gep and store after the + // instruction that defines the original return temp, so that we can store + // the result into the proper index of the return vector. For case 2, we must + // go into the loop and replace the old scalar alloca reference with the one + // just created as vector. + + Instruction *VecReturn = NULL; + VectorType *ReturnType = dyn_cast(Clone->getReturnType()); + + if (!LoadFromAlloca) { + + // Case 1 + + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + Value *RetVal = FuncReturn->getReturnValue(); + Instruction *RetFromTemp = dyn_cast(RetVal); + + Instruction *InsertPt; + Value *ValToStore; + Instruction *Phi = &*LoopBlock->begin(); + + if (RetFromTemp) { + // If we're returning from an SSA temp, set the insert point to the + // definition of the temp. + InsertPt = RetFromTemp; + ValToStore = RetFromTemp; + } else { + // If we're returning a constant, then set the insert point to the loop + // phi. From here, a store to the vector using the constant is inserted. + InsertPt = Phi; + ValToStore = RetVal; + } + + // Generate a gep from the bitcast of the vector alloca used for the return + // vector. 
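+    // For a hypothetical i32 return value this produces IR along the lines
+    // of: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index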
+ Twine GepName = VecReturn->getName() + ".gep"; + GetElementPtrInst *VecGep = + GetElementPtrInst::Create(ReturnType->getElementType(), + VecReturn, Phi, GepName); + VecGep->insertAfter(InsertPt); + + // Store the constant or temp to the appropriate lane in the return vector. + StoreInst *VecStore = new StoreInst(ValToStore, VecGep); + VecStore->insertAfter(VecGep); + + } else { + + // Case 2 + + AllocaInst *Alloca = dyn_cast(LoadFromAlloca->getOperand(0)); + bool AllocaFound = false; + unsigned ParmIdx = 0; + + for (; ParmIdx < VectorParmMap.size(); ParmIdx++) { + Value *ParmVal = VectorParmMap[ParmIdx]->VectorParm; + if (ParmVal == Alloca) + AllocaFound = true; + } + + if (AllocaFound) { + // There's already a vector alloca created for the return, which is the + // same one used for the parameter. E.g., we're returning the updated + // parameter. + VecReturn = VectorParmMap[ParmIdx]->VectorParmCast; + } else { + // A new return vector is needed because we do not load the return value + // from an alloca. + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + ParmRef *PRef = new ParmRef(); + PRef->VectorParm = Alloca; + PRef->VectorParmCast = VecReturn; + VectorParmMap.push_back(PRef); + } + } + + return VecReturn; +} + +Instruction* VecClone::expandVectorParametersAndReturn( + Function *Clone, + VectorVariant &V, + Instruction **Mask, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& VectorParmMap) +{ + // If there are no parameters, then this function will do nothing and this + // is the expected behavior. + *Mask = expandVectorParameters(Clone, V, EntryBlock, VectorParmMap); + + // If the function returns void, then don't attempt to expand to vector. + Instruction *ExpandedReturn = ReturnBlock->getTerminator(); + if (!Clone->getReturnType()->isVoidTy()) { + ExpandedReturn = expandReturn(Clone, EntryBlock, LoopBlock, ReturnBlock, + VectorParmMap); + } + + // So, essentially what has been done to this point is the creation and + // insertion of the vector alloca instructions. Now, we insert the bitcasts of + // those instructions, which have been stored in the map. The insertion of the + // vector bitcast to element type pointer is done at the end of the EntryBlock + // to ensure that any initial stores of vector parameters have been done + // before the cast. + + std::vector::iterator MapIt; + for (auto MapIt : VectorParmMap) { + Instruction *ExpandedCast = MapIt->VectorParmCast; + if (!ExpandedCast->getParent()) { + insertInstruction(ExpandedCast, EntryBlock); + } + } + + // Insert the mask parameter store to alloca and bitcast if this is a masked + // variant. + if (*Mask) { + // Mask points to the bitcast of the alloca instruction to element type + // pointer. Insert the bitcast after all of the other bitcasts for vector + // parameters. + insertInstruction(*Mask, EntryBlock); + + Value *MaskVector = (*Mask)->getOperand(0); + + // MaskParm points to the function's mask parameter. + Function::ArgumentListType &ArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator MaskParm = ArgList.end(); + MaskParm--; + + // Find the last parameter store in the function entry block and insert the + // the store of the mask parameter after it. We do this just to make the + // LLVM IR easier to read. If there are no parameters, just insert the store + // before the terminator. For safety, if we cannot find a store, then insert + // this store after the last alloca. 
At this point, there will at least be + // an alloca for either a parameter or return. This code just ensures that + // the EntryBlock instructions are grouped by alloca, followed by store, + // followed by bitcast for readability reasons. + + StoreInst *MaskStore = new StoreInst(&*MaskParm, MaskVector); + insertInstruction(MaskStore, EntryBlock); + } + + DEBUG(dbgs() << "After Parameter/Return Expansion\n"); + DEBUG(Clone->dump()); + + return ExpandedReturn; +} + +bool VecClone::typesAreCompatibleForLoad(Type *GepType, Type *LoadType) +{ + // GepType will always be a pointer since this refers to an alloca for a + // vector. + PointerType *GepPtrTy = dyn_cast(GepType); + Type *LoadFromTy = GepPtrTy->getElementType(); + Type *LoadToTy = LoadType; + + // Dereferencing pointers in LLVM IR means that we have to have a load for + // each level of indirection. This means that we load from a gep and the + // resulting load value type is reduced by one level of indirection. For + // example, we load from a gep of i32* to a temp that has an i32 type. We + // cannot do multiple levels of dereferencing in a single load. For example, + // we cannot load from a gep of i32** to an i32. This requires two loads. + // + // Legal Case: GepType = i32**, LoadFromTy = i32*, + // LoadType = i32*, LoadToTy = i32* + // + // %vec.b.elem.2 = load i32*, i32** %vec.b.cast.gep1 + // + // In this case, since both are pointers, types will be considered equal by + // LLVM, so we must continue getting the element types of each pointer type + // until one is no longer a pointer type. Then do an equality check. + // + // Legal Case: GepType = i32*, LoadFromTy = i32, + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32* %vec.b.cast.gep1 + // + // Ready to compare as is + // + // Illegal Case: GepType = i32**, LoadFromTy = i32* + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32** %vec.b.cast.gep1 + // + // This case arises due to differences in the LLVM IR at -O0 and >= -O1. + // For >= -O1, Mem2Reg registerizes parameters and there are no alloca + // instructions created for function parameters. At -O0, vector parameters + // are expanded and we modify the existing alloca that was used for the scalar + // parameter. When there is no alloca for vector parameters, we must create + // one for them. Thus, we have introduced an additional level of indirection + // for users of parameters at >= -O1. This can become a problem for load + // instructions and results in this illegal case. This function helps to + // check that we are not attempting to do an extra level of indirection + // within the load instructions for elements of vector parameters in the + // simd loop. If an illegal case is encountered, an additional load is + // inserted to account for the extra level of indirection and any users are + // updated accordingly. 
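+  //
+  // When this check fails for a load user, updateScalarMemRefsWithVector
+  // (below) does not rewrite the load in place. Instead it materializes an
+  // element load from the gep and points the original load at it, adding the
+  // missing level of indirection. Using the illegal case above for
+  // illustration (names follow the "vec.<parm>.elem" pattern used below):
+  //
+  //   %vec.b.elem = load i32*, i32** %vec.b.cast.gep
+  //   %2 = load i32, i32* %vec.b.elem
+  //
+  // where the first load is the inserted fix-up and the second is the
+  // original user, now dereferencing only one level per load.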
+ + while (LoadFromTy->getTypeID() == Type::PointerTyID && + LoadToTy->getTypeID() == Type::PointerTyID) { + + PointerType *FromPtrTy = cast(LoadFromTy); + PointerType *ToPtrTy = cast(LoadToTy); + + LoadFromTy = FromPtrTy->getElementType(); + LoadToTy = ToPtrTy->getElementType(); + } + + if (LoadFromTy->getTypeID() == LoadToTy->getTypeID()) { + return true; + } + + return false; +} + +void VecClone::updateScalarMemRefsWithVector( + Function *Clone, + Function &F, + BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, + PHINode *Phi, + std::vector& VectorParmMap) +{ + // This function replaces the old scalar uses of a parameter with a reference + // to the new vector one. A gep is inserted using the vector bitcast created + // in the entry block and any uses of the parameter are replaced with this + // gep. The only users that will not be updated are those in the entry block + // that do the initial store to the vector alloca of the parameter. + + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + + SmallVector InstsToUpdate; + Value *Parm = VectorParmMapIt->VectorParm; + Instruction *Cast = VectorParmMapIt->VectorParmCast; + + for (User *U : Parm->users()) { + InstsToUpdate.push_back(dyn_cast(U)); + } + + for (unsigned I = 0; I < InstsToUpdate.size(); ++I) { + + Instruction *User = InstsToUpdate[I]; + if (!(dyn_cast(User) && User->getParent() == EntryBlock)) { + + BitCastInst *BitCast = dyn_cast(Cast); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + Twine GepName = BitCast->getName() + ".gep"; + GetElementPtrInst *VecGep = + GetElementPtrInst::Create(PointeeType, BitCast, Phi, GepName, + User); + + unsigned NumOps = User->getNumOperands(); + for (unsigned I = 0; I < NumOps; ++I) { + if (User->getOperand(I) == Parm) { + + bool TypesAreCompatible = false; + + if (isa(User)) { + TypesAreCompatible = + typesAreCompatibleForLoad(VecGep->getType(), User->getType()); + } + + if ((isa(User) && TypesAreCompatible) || + isa(User)) { + // If the user is a load/store and the dereferencing is legal, + // then just modify the load/store operand to use the gep. + User->setOperand(I, VecGep); + } else { + // Otherwise, we need to load the value from the gep first before + // using it. This effectively loads the particular element from + // the vector parameter. + Twine LoadName = "vec." + Parm->getName() + ".elem"; + LoadInst *ParmElemLoad = new LoadInst(VecGep, LoadName); + ParmElemLoad->insertAfter(VecGep); + User->setOperand(I, ParmElemLoad); + } + } + } + } else { + // The user is the parameter store to alloca in the entry block. Replace + // the old scalar alloca with the new vector one. + AllocaInst *VecAlloca = dyn_cast(Cast->getOperand(0)); + User->setOperand(1, VecAlloca); + } + } + } + + DEBUG(dbgs() << "After Alloca Replacement\n"); + DEBUG(Clone->dump()); +} + +Instruction* VecClone::generateStrideForParameter( + Function *Clone, + Argument *Arg, + Instruction *ParmUser, + int Stride, + PHINode *Phi) +{ + // Value returned as the last instruction needed to update the users of the + // old parameter reference. + Instruction *StrideInst = nullptr; + + Constant *StrideConst = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), Stride); + + Instruction *Mul = BinaryOperator::CreateMul(StrideConst, Phi, "stride.mul"); + + // Insert the stride related instructions after the user if the instruction + // involves a redefinition of the parameter. 
For example, a load from the + // parameter's associated alloca or a cast. For these situations, we want to + // apply the stride to this SSA temp. For other instructions, e.g., add, the + // instruction computing the stride must be inserted before the user. + + if (!isa(ParmUser)) { + Mul->insertBefore(ParmUser); + } else { + Mul->insertAfter(ParmUser); + } + + if (Arg->getType()->isPointerTy()) { + + // Linear updates to pointer parameters involves an address calculation, so + // use gep. To properly update linear pointers we only need to multiply the + // loop index and stride since gep is indexed starting at 0 from the base + // address passed to the vector function. + PointerType *ParmPtrType = dyn_cast(Arg->getType()); + + // The base address used for linear gep computations. + Value *BaseAddr = nullptr; + StringRef RefName; + + if (LoadInst *ParmLoad = dyn_cast(ParmUser)) { + // We are loading from the alloca of the pointer parameter (no Mem2Reg) + // i.e., loading a pointer to an SSA temp. + BaseAddr = ParmUser; + RefName = ParmLoad->getOperand(0)->getName(); + } else { + // The user is using the pointer parameter directly. + BaseAddr = Arg; + RefName = BaseAddr->getName(); + } + + // Mul is always generated as i32 since it is calculated using the i32 loop + // phi that is inserted by this pass. No cast on Mul is necessary because + // gep can use a base address of one type with an index of another type. + Twine GepName = RefName + ".gep"; + GetElementPtrInst *LinearParmGep = + GetElementPtrInst::Create(ParmPtrType->getElementType(), + BaseAddr, Mul, GepName); + + LinearParmGep->insertAfter(Mul); + StrideInst = LinearParmGep; + } else { + // For linear values, a mul/add sequence is needed to generate the correct + // value. i.e., val = linear_var * stride + loop_index; + // + // Also, Mul above is generated as i32 because the phi type is always i32. + // However, ParmUser may be another integer type, so we must convert i32 to + // i8/i16/i64 when the user is not i32. + + if (ParmUser->getType()->getIntegerBitWidth() != + Mul->getType()->getIntegerBitWidth()) { + + Instruction *MulConv = + CastInst::CreateIntegerCast(Mul, ParmUser->getType(), true, + "stride.cast"); + MulConv->insertAfter(Mul); + Mul = MulConv; + } + + // Generate the instruction that computes the stride. + BinaryOperator *Add; + StringRef TempName = "stride.add"; + if (isa(ParmUser)) { + // The user of the parameter is an instruction that results in a + // redefinition of it. e.g., a load from an alloca (no Mem2Reg) or a cast + // instruction. In either case, the stride needs to be applied to this + // temp. + Add = BinaryOperator::CreateAdd(ParmUser, Mul, TempName); + } else { + // Otherwise, the user is an instruction that does not redefine the temp, + // such as an add instruction. For these cases, the stride must be + // computed before the user and the reference to the parameter must be + // replaced with this instruction. + Add = BinaryOperator::CreateAdd(Arg, Mul, TempName); + } + + Add->insertAfter(Mul); + StrideInst = Add; + } + + return StrideInst; +} + +void VecClone::updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi) +{ + // Add stride to parameters marked as linear. This is done by finding all + // users of the scalar alloca associated with the parameter. The user should + // be a load from this alloca to a temp. The stride is then added to this temp + // and its uses are replaced with the new temp. 
Or, if Mem2Reg eliminates the + // alloca/load, the parameter is used directly and this use is updated with + // the stride. + + Function::ArgumentListType &ArgList = Clone->getArgumentList(); + Function::ArgumentListType::iterator ArgListIt = ArgList.begin(); + Function::ArgumentListType::iterator ArgListEnd = ArgList.end(); + std::vector ParmKinds = V.getParameters(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + + User::user_iterator ArgUserIt = ArgListIt->user_begin(); + User::user_iterator ArgUserEnd = ArgListIt->user_end(); + unsigned ParmIdx = ArgListIt->getArgNo(); + SmallVector LinearParmUsers; + + if (ParmKinds[ParmIdx].isLinear()) { + + int Stride = ParmKinds[ParmIdx].getStride(); + + for (; ArgUserIt != ArgUserEnd; ++ArgUserIt) { + + // Collect all uses of the parameter so that they can later be used to + // apply the stride. + Instruction *ParmUser = dyn_cast(*ArgUserIt); + if (StoreInst *ParmStore = dyn_cast(ParmUser)) { + + // This code traces the store of the parameter to its associated + // alloca. Then, we look for a load from that alloca to a temp. This + // is the value we need to add the stride to. This is for when + // Mem2Reg has not been run. + AllocaInst *Alloca = dyn_cast(ArgUserIt->getOperand(1)); + + if (Alloca) { + for (auto *AU : Alloca->users()) { + + LoadInst *ParmLoad = dyn_cast(AU); + + if (ParmLoad) { + // The parameter is being loaded from an alloca to a new SSA + // temp. We must replace the users of this load with an + // instruction that adds the result of this load with the + // stride. + LinearParmUsers.push_back(ParmLoad); + } + } + } else { + // Mem2Reg has run, so the parameter is directly referenced in the + // store instruction. + LinearParmUsers.push_back(ParmStore); + } + } else { + // Mem2Reg has registerized the parameters, so users of it will use + // it directly, and not through a load of the parameter. + LinearParmUsers.push_back(ParmUser); + } + + for (unsigned I = 0; I < LinearParmUsers.size(); I++) { + // For each user of parameter: + + // We must deal with two cases here, based on whether Mem2Reg has been + // run. + // + // Example: + // + // __declspec(vector(linear(i:1),uniform(x),vectorlength(4))) + // extern int foo(int i, int x) { + // return (x + i); + // } + // + // 1) We are loading the parameter from an alloca and the SSA temp as + // as a result of the load is what we need to add the stride to. + // Then, any users of that temp must be replaced. The only load + // instructions put in the collection above are guaranteed to be + // associated with the parameter's alloca. Thus, we only need to + // check to see if a load is in the map to know what to do. 
+ // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 <--- %i + // %add = add nsw i32 %0, %1 <--- replace %1 with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %1, %stride.mul <--- stride + // %add = add nsw i32 %0, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // 2) The user uses the parameter directly, and so we must apply the + // stride directly to the parameter. Any users of the parameter + // must then be updated. + // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %add = add nsw i32 %x, %i <-- direct usage of %i + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %i, %stride.mul <--- stride + // %add = add nsw i32 %x, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + + Instruction *StrideInst = + generateStrideForParameter(Clone, &*ArgListIt, LinearParmUsers[I], + Stride, Phi); + + SmallVector InstsToUpdate; + Value *ParmUser; + + if (isa(LinearParmUsers[I])) { + // Case 1 + ParmUser = LinearParmUsers[I]; + User::user_iterator StrideUserIt = LinearParmUsers[I]->user_begin(); + User::user_iterator StrideUserEnd = LinearParmUsers[I]->user_end(); + + // Find the users of the redefinition of the parameter so that we + // can apply the stride to those instructions. + for (; StrideUserIt != StrideUserEnd; ++StrideUserIt) { + + Instruction *StrideUser = dyn_cast(*StrideUserIt); + if (StrideUser != StrideInst) { + // We've already inserted the stride which is now also a user of + // the parameter, so don't update that instruction. Otherwise, + // we'll create a self reference. Hence, why we don't use + // replaceAllUsesWith(). + InstsToUpdate.push_back(StrideUser); + } + } + } else { + // Case 2 + ParmUser = &*ArgListIt; + InstsToUpdate.push_back(LinearParmUsers[I]); + } + + // Replace the old references to the parameter with the instruction + // that applies the stride. 
+ for (unsigned J = 0; J < InstsToUpdate.size(); ++J) { + unsigned NumOps = InstsToUpdate[J]->getNumOperands(); + for (unsigned K = 0; K < NumOps; ++K) { + if (InstsToUpdate[J]->getOperand(K) == ParmUser) { + InstsToUpdate[J]->setOperand(K, StrideInst); + } + } + } + } + } + } + } + + DEBUG(dbgs() << "After Linear Updates\n"); + DEBUG(Clone->dump()); +} + +void VecClone::updateReturnBlockInstructions( + Function *Clone, + BasicBlock *ReturnBlock, + Instruction *ExpandedReturn) +{ + // If the vector function returns void, then there is no need to do any + // packing. The only instruction in the ReturnBlock is 'ret void', so + // we can just leave this instruction and we're done. + if (Clone->getReturnType()->isVoidTy()) return; + + // Collect all instructions in the return basic block. They will be removed. + SmallVector InstToRemove; + BasicBlock::iterator InstIt = ReturnBlock->begin(); + BasicBlock::iterator InstEnd = ReturnBlock->end(); + + for (; InstIt != InstEnd; ++InstIt) { + InstToRemove.push_back(&*InstIt); + } + + // Remove all instructions from the return block. These will be replaced + // with the instructions necessary to return a vector temp. The verifier + // will complain if we remove the definitions of users first, so remove + // instructions from the bottom up. + for (int I = InstToRemove.size() - 1; I >= 0; I--) { + InstToRemove[I]->eraseFromParent(); + } + + // Pack up the elements into a vector temp and return it. If the return + // vector was bitcast to a pointer to the element type, we must bitcast to + // vector before returning. + Instruction *Return; + if (dyn_cast(ExpandedReturn)) { + // Operand 0 is the actual alloc reference in the bitcast. + AllocaInst *Alloca = dyn_cast(ExpandedReturn->getOperand(0)); + PointerType *PtrVecType = + PointerType::get(Clone->getReturnType(), + Alloca->getType()->getAddressSpace()); + Twine CastName = "vec." + ExpandedReturn->getName(); + BitCastInst *BitCast = new BitCastInst(ExpandedReturn, PtrVecType, + CastName, ReturnBlock); + Return = BitCast; + } else { + Return = ExpandedReturn; + } + + LoadInst *VecReturn = new LoadInst(Return, "vec.ret", ReturnBlock); + ReturnInst::Create(Clone->getContext(), VecReturn, ReturnBlock); + + DEBUG(dbgs() << "After Return Block Update\n"); + DEBUG(Clone->dump()); +} + +int VecClone::getParmIndexInFunction(Function *F, Value *Parm) +{ + Function::arg_iterator ArgIt = F->arg_begin(); + Function::arg_iterator ArgEnd = F->arg_end(); + for (unsigned Idx = 0; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + if (Parm == &*ArgIt) return Idx; + } + + return -1; +} + +bool VecClone::isSimpleFunction(Function *Clone, VectorVariant &V, + ReturnInst *ReturnOnly) +{ + // For really simple functions, there is no need to go through the process + // of inserting a loop. + + // Example: + // + // void foo(void) { + // return; + // } + // + // No need to insert a loop for this case since it's basically a no-op. Just + // clone the function and return. It's possible that we could have some code + // inside of a vector function that modifies global memory. Let that case go + // through. 
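+  //
+  // The void_foo.ll test added with this patch exercises this early exit:
+  // the cloned variant's body is left as a bare "ret void" and no simd loop
+  // is inserted for it.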
+ if (ReturnOnly && Clone->getReturnType()->isVoidTy()) { + return true; + } + + return false; +} + +void VecClone::insertSplitForMaskedVariant(Function *Clone, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi) +{ + BasicBlock *LoopThenBlock = + LoopBlock->splitBasicBlock(LoopBlock->getFirstNonPHI(), + "simd.loop.then"); + + BasicBlock *LoopElseBlock = BasicBlock::Create(Clone->getContext(), + "simd.loop.else", + Clone, LoopExitBlock); + + BranchInst::Create(LoopExitBlock, LoopElseBlock); + + BitCastInst *BitCast = dyn_cast(Mask); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + GetElementPtrInst *MaskGep = + GetElementPtrInst::Create(PointeeType, Mask, Phi, "mask.gep", + LoopBlock->getTerminator()); + + Twine LoadName = "mask.parm"; + LoadInst *MaskLoad = new LoadInst(MaskGep, LoadName, + LoopBlock->getTerminator()); + + Type *CompareTy = MaskLoad->getType(); + Instruction *MaskCmp; + Constant* Zero; + + // Generate the compare instruction to see if the mask bit is on. In ICC, we + // use the movemask intrinsic which takes both float/int mask registers and + // converts to an integer scalar value, one bit representing each element. + // AVR construction will be complicated if this intrinsic is introduced here, + // so the current solution is to just generate either an integer or floating + // point compare instruction for now. This may change anyway if we decide to + // go to a vector of i1 values for the mask. I suppose this would be one + // positive reason to use vector of i1. + if (CompareTy->isIntegerTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0); + MaskCmp = new ICmpInst(LoopBlock->getTerminator(), CmpInst::ICMP_NE, + MaskLoad, Zero, "mask.cond"); + } else if (CompareTy->isFloatingPointTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0.0); + MaskCmp = new FCmpInst(LoopBlock->getTerminator(), CmpInst::FCMP_UNE, + MaskLoad, Zero, "mask.cond"); + } else { + assert(0 && "Unsupported mask compare"); + } + + TerminatorInst *Term = LoopBlock->getTerminator(); + Term->eraseFromParent(); + BranchInst::Create(LoopThenBlock, LoopElseBlock, MaskCmp, LoopBlock); + + DEBUG(dbgs() << "After Split Insertion For Masked Variant\n"); + DEBUG(Clone->dump()); +} + +void VecClone::removeScalarAllocasForVectorParams( + std::vector &VectorParmMap) +{ + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + Value *Parm = VectorParmMapIt->VectorParm; + if (AllocaInst *ScalarAlloca = dyn_cast(Parm)) { + ScalarAlloca->eraseFromParent(); + } + } +} + +void VecClone::disableLoopUnrolling(BasicBlock *Latch) +{ + // Set disable unroll metadata on the conditional branch of the loop latch + // for the simd loop. The following is an example of what the loop latch + // and Metadata will look like. The !llvm.loop marks the beginning of the + // loop Metadata and is always placed on the terminator of the loop latch. + // (i.e., simd.loop.exit in this case). According to LLVM documentation, to + // properly set the loop Metadata, the 1st operand of !16 must be a self- + // reference to avoid some type of Metadata merging conflicts that have + // apparently arisen in the past. This is part of LLVM history that I do not + // know. Also, according to LLVM documentation, any Metadata nodes referring + // to themselves are marked as distinct. 
As such, all Metadata corresponding + // to a loop belongs to that loop alone and no sharing of Metadata can be + // done across different loops. + // + // simd.loop.exit: ; preds = %simd.loop, %if.else, %if.then + // %indvar = add nuw i32 %index, 1 + // %vl.cond = icmp ult i32 %indvar, 2 + // br i1 %vl.cond, label %simd.loop, label %simd.end.region, !llvm.loop !16 + // + // !16 = distinct !{!16, !17} + // !17 = !{!"llvm.loop.unroll.disable"} + + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + + // Add unroll(disable) metadata to disable future unrolling. + LLVMContext &Context = Latch->getContext(); + SmallVector DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + Latch->getTerminator()->setMetadata("llvm.loop", NewLoopID); +} + +bool VecClone::runOnModule(Module &M) { + + DEBUG(dbgs() << "\nExecuting SIMD Function Cloning ...\n\n"); + + FunctionVariants FunctionsToVectorize; + getFunctionsToVectorize(M, FunctionsToVectorize); + if (FunctionsToVectorize.empty()) { + // No vector variants for this function exist. + return false; + } + + for (auto& pair : FunctionsToVectorize) { + + Function& F = *pair.first; + DeclaredVariants &DeclaredVariants = pair.second; + + for (auto& DeclaredVariant : DeclaredVariants) { + + VectorVariant Variant(DeclaredVariant); + + // Clone the original function. + DEBUG(dbgs() << "Before SIMD Function Cloning\n"); + DEBUG(F.dump()); + Function *Clone = cloneFunction(F, Variant); + Function::iterator EntryBlock = Clone->begin(); + BasicBlock::iterator FirstInst = EntryBlock->begin(); + ReturnInst *ReturnOnly = dyn_cast(FirstInst); + + if (isSimpleFunction(Clone, Variant, ReturnOnly)) { + continue; + } + + BasicBlock *LoopBlock = splitEntryIntoLoop(Clone, Variant, &*EntryBlock); + BasicBlock *ReturnBlock = splitLoopIntoReturn(Clone, LoopBlock); + BasicBlock *LoopExitBlock = createLoopExit(Clone, ReturnBlock); + PHINode *Phi = createPhiAndBackedgeForLoop(Clone, &*EntryBlock, + LoopBlock, LoopExitBlock, + ReturnBlock, + Variant.getVlen()); + + // At this point, we've gathered some parameter information and have + // restructured the function into an entry block, a set of blocks + // forming the loop, a loop exit block, and a return block. Now, + // we can go through and update instructions since we know what + // is part of the loop. + + // VectorParmMap contains the mapping of the parameter to the bitcast + // instruction that casts the vector alloca for vector parameters + // to a scalar pointer for use in the simd loop. When parameters are + // registerized, the Value* in the map correponds directly to the + // function parameter. When parameters are not registerized, then the + // Value* in the map is the original scalar alloca before expansion. + // Later, users of the parameter, either directly or through the alloca, + // are replaced with a gep using the bitcast of the vector alloca for the + // parameter and the current loop induction variable value. + // + // IMPORTANT NOTE: std::vector was used here because later we emit LLVM + // instructions using the members of ParmRef, and these instructions + // should be ordered consistently for easier testability. 
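+      //
+      // As an illustrative example (names from the two_vec_sum.ll test in
+      // this patch, which was generated at -O0 and therefore still has
+      // parameter allocas), the map conceptually ends up holding:
+      //
+      //   VectorParm           VectorParmCast
+      //   %i.addr (alloca)     %vec.i.cast = bitcast <4 x i32>* %vec.i to i32*
+      //   %j.addr (alloca)     %vec.j.cast = bitcast <4 x i32>* %vec.j to i32*
+      //
+      // so the rewrites below can form geps such as
+      //   %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index
+      // inside the simd loop.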
+ + std::vector VectorParmMap; + + // Create a new vector alloca instruction for all vector parameters and + // return. For parameters, replace the initial store to the old alloca + // with the vector one. Users of the old alloca within the loop will be + // replaced with a gep using this address along with the proper loop + // index. + + Instruction *Mask = NULL; + Instruction *ExpandedReturn = + expandVectorParametersAndReturn(Clone, Variant, &Mask, &*EntryBlock, + LoopBlock, ReturnBlock, + VectorParmMap); + updateScalarMemRefsWithVector(Clone, F, &*EntryBlock, ReturnBlock, Phi, + VectorParmMap); + + // Update any linear variables with the appropriate stride. This function + // will insert a mul/add sequence before the use of the parameter. For + // linear pointer parameters, the stride calculation is just a mul + // instruction using the loop induction var and the stride value on the + // parameter. This mul instruction is then used as the index of the gep + // that will be inserted before the next use of the parameter. The + // function also updates the users of the parameter with the new + // calculation involving the stride. + updateLinearReferences(Clone, F, Variant, Phi); + + // Remove the old scalar instructions associated with the return and + // replace with packing instructions. + updateReturnBlockInstructions(Clone, ReturnBlock, ExpandedReturn); + + // Remove the old scalar allocas associated with vector parameters since + // these have now been replaced with vector ones. + removeScalarAllocasForVectorParams(VectorParmMap); + VectorParmMap.clear(); + + // If this is the masked vector variant, insert the mask condition and + // if/else blocks. + if (Variant.isMasked()) { + insertSplitForMaskedVariant(Clone, LoopBlock, LoopExitBlock, Mask, Phi); + } + + DEBUG(dbgs() << "After SIMD Function Cloning\n"); + DEBUG(Clone->dump()); + + // Disable unrolling from kicking in on the simd loop. + disableLoopUnrolling(LoopExitBlock); + + } // End of function cloning for the variant + } // End of function cloning for all variants + + return true; // LLVM IR has been modified +} + +void VecClone::print(raw_ostream &OS, const Module *M) const { + // TODO +} + +ModulePass *llvm::createVecClonePass() { + return new llvm::VecClone(); +} + +char VecClone::ID = 0; + +static const char lv_name[] = "VecClone"; +INITIALIZE_PASS_BEGIN(VecClone, SV_NAME, lv_name, + false /* modifies CFG */, false /* transform pass */) +INITIALIZE_PASS_END(VecClone, SV_NAME, lv_name, + false /* modififies CFG */, false /* transform pass */) Index: test/Transforms/VecClone/broadcast.ll =================================================================== --- test/Transforms/VecClone/broadcast.ll +++ test/Transforms/VecClone/broadcast.ll @@ -0,0 +1,19 @@ +; Check broadcast of a constant. The store of the constant should be moved inside of the loop. 
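+; This exercises "case 1" of VecClone::expandReturn, where the returned value
+; is a constant rather than a load from an alloca: the pass creates the vector
+; return alloca and stores the constant through a per-iteration gep, which is
+; why the store lands inside simd.loop.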
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4_foo +; CHECK: simd.loop: +; CHECK: store i32 99, i32* %ret.cast.gep + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo() #0 { +entry: + ret i32 99 +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM4_foo" "_ZGVbN4_foo" "_ZGVcM8_foo" "_ZGVcN8_foo" "_ZGVdM8_foo" "_ZGVdN8_foo" "_ZGVeM16_foo" "_ZGVeN16_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/convert_linear.ll =================================================================== --- test/Transforms/VecClone/convert_linear.ll +++ test/Transforms/VecClone/convert_linear.ll @@ -0,0 +1,32 @@ +; Check handling of upconverting a linear (variable %i) to ensure stride calculation +; is inserted correctly and the old convert (sext) uses the stride instead of the old +; reference to %i. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN2vl_foo +; CHECK: simd.loop: +; CHECK: %0 = load i32, i32* %i.addr +; CHECK-NEXT: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %0, %stride.mul +; CHECK-NEXT: %conv = sext i32 %stride.add to i64 + +; ModuleID = 'convert.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i64 @foo(i64 %x, i32 %i) #0 { +entry: + %x.addr = alloca i64, align 8 + %i.addr = alloca i32, align 4 + store i64 %x, i64* %x.addr, align 8 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %x.addr, align 8 + %add = add nsw i64 %conv, %1 + ret i64 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM2vl_foo" "_ZGVbN2vl_foo" "_ZGVcM4vl_foo" "_ZGVcN4vl_foo" "_ZGVdM4vl_foo" "_ZGVdN4vl_foo" "_ZGVeM8vl_foo" "_ZGVeN8vl_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/external_array.ll =================================================================== --- test/Transforms/VecClone/external_array.ll +++ test/Transforms/VecClone/external_array.ll @@ -0,0 +1,35 @@ +; Check to see that we are applying the correct updated linear index for an external array access gep. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4ul_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %idxprom = sext i32 %stride.add to i64 +; CHECK: %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom +; CHECK: store i32 %0, i32* %arrayidx + +; ModuleID = 'external_array_assign.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@ext_a = common global [128 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @foo(i32 %x, i32 %i) #0 { +entry: + %x.addr = alloca i32, align 4 + %i.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom + store i32 %0, i32* %arrayidx, align 4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable "_ZGVbM4ul_foo" "_ZGVbN4ul_foo" "_ZGVcM8ul_foo" "_ZGVcN8ul_foo" "_ZGVdM8ul_foo" "_ZGVdN8ul_foo" "_ZGVeM16ul_foo" "_ZGVeN16ul_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear.ll =================================================================== --- test/Transforms/VecClone/linear.ll +++ test/Transforms/VecClone/linear.ll @@ -0,0 +1,29 @@ +; Check to see that the linear parameter i is updated with the correct stride, indicated by a mul/add instruction sequence after the load. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %add = add nsw i32 %0, %stride.add + +; ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %i.addr = alloca i32, align 4 + %x.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %x, i32* %x.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM4lu_foo" "_ZGVbN4lu_foo" "_ZGVcM8lu_foo" "_ZGVcN8lu_foo" "_ZGVdM8lu_foo" "_ZGVdN8lu_foo" "_ZGVeM16lu_foo" "_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear_mem2reg.ll =================================================================== --- test/Transforms/VecClone/linear_mem2reg.ll +++ test/Transforms/VecClone/linear_mem2reg.ll @@ -0,0 +1,22 @@ +; Check to see that the linear parameter i is updated with the correct stride when Mem2Reg is on. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %i, %stride.mul +; CHECK-NEXT: %add = add nsw i32 %x, %stride.add + +;ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %add = add nsw i32 %x, %i + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "_ZGVbM4lu_foo" "_ZGVbN4lu_foo" "_ZGVcM8lu_foo" "_ZGVcN8lu_foo" "_ZGVdM8lu_foo" "_ZGVdN8lu_foo" "_ZGVeM16lu_foo" "_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/struct_linear_ptr.ll =================================================================== --- test/Transforms/VecClone/struct_linear_ptr.ll +++ test/Transforms/VecClone/struct_linear_ptr.ll @@ -0,0 +1,40 @@ +; Test that the stride is being applied correctly to struct field accesses. 
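+; For linear pointer parameters the stride is applied through a gep rather
+; than a mul/add on the value (see VecClone::generateStrideForParameter): the
+; loop index is multiplied by the stride and the product is used as the gep
+; index off the loaded base pointer, which is what the %s.addr.gep checks
+; below verify.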
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4l_foo +; CHECK: simd.loop: +; CHECK: %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %0, i32 %stride.mul{{.*}} +; CHECK: %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 0 +; CHECK: %1 = load float, float* %field1, align 8 +; CHECK: %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %2, i32 %stride.mul{{.*}} +; CHECK: %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 4 +; CHECK: %3 = load float, float* %field5, align 8 +; CHECK: %add = fadd float %1, %3 + +; ModuleID = 'struct_linear_ptr.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.my_struct = type { float, i8, i32, i16, float, i64 } + +; Function Attrs: nounwind uwtable +define float @foo(%struct.my_struct* %s) #0 { +entry: + %s.addr = alloca %struct.my_struct*, align 8 + store %struct.my_struct* %s, %struct.my_struct** %s.addr, align 8 + %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %0, i32 0, i32 0 + %1 = load float, float* %field1, align 8 + %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %2, i32 0, i32 4 + %3 = load float, float* %field5, align 8 + %add = fadd float %1, %3 + ret float %add +} + +attributes #0 = { norecurse nounwind readonly uwtable "_ZGVbM4l_foo" "_ZGVbN4l_foo" "_ZGVcM8l_foo" "_ZGVcN8l_foo" "_ZGVdM8l_foo" "_ZGVdN8l_foo" "_ZGVeM16l_foo" "_ZGVeN16l_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum.ll +++ test/Transforms/VecClone/two_vec_sum.ll @@ -0,0 +1,59 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the non-masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
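+; Reading the variant name being checked: in _ZGVbN4vv_vec_sum, "N" marks the
+; non-masked variant (the masked counterpart, _ZGVbM4vv_vec_sum, is covered in
+; two_vec_sum_mask.ll), "4" is the vector length that becomes the simd loop
+; trip count, and "vv" indicates that both parameters are vector parameters.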
+ +; CHECK-LABEL: <4 x i32> @_ZGVbN4vv_vec_sum(<4 x i32> %i, <4 x i32> %j) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "_ZGVbM4vv_vec_sum" "_ZGVbN4vv_vec_sum" "_ZGVcM8vv_vec_sum" "_ZGVcN8vv_vec_sum" "_ZGVdM8vv_vec_sum" "_ZGVdN8vv_vec_sum" "_ZGVeM16vv_vec_sum" "_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mask.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mask.ll +++ test/Transforms/VecClone/two_vec_sum_mask.ll @@ -0,0 +1,71 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
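+; For the masked variant the pass splits the loop body: the mask element for
+; the current lane is loaded through a gep on the mask's cast, compared
+; against zero, and the original body only runs on the simd.loop.then path,
+; while simd.loop.else falls through to simd.loop.exit.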
+ +; Begin non-masked variant checking + +; CHECK-LABEL: <4 x i32> @_ZGVbM4vv_vec_sum(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.mask = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i, align 4 +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j, align 4 +; CHECK-NEXT: store <4 x i32> %mask, <4 x i32>* %vec.mask +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: %mask.cast = bitcast <4 x i32>* %vec.mask to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %mask.gep = getelementptr i32, i32* %mask.cast, i32 %index +; CHECK-NEXT: %mask.parm = load i32, i32* %mask.gep +; CHECK-NEXT: %mask.cond = icmp ne i32 %mask.parm, 0 +; CHECK-NEXT: br i1 %mask.cond, label %simd.loop.then, label %simd.loop.else + +; CHECK: simd.loop.then: +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.else: +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "_ZGVbM4vv_vec_sum" "_ZGVbN4vv_vec_sum" "_ZGVcM8vv_vec_sum" "_ZGVcN8vv_vec_sum" "_ZGVdM8vv_vec_sum" "_ZGVdN8vv_vec_sum" "_ZGVeM16vv_vec_sum" "_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mem2reg.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mem2reg.ll +++ test/Transforms/VecClone/two_vec_sum_mem2reg.ll @@ -0,0 +1,31 @@ +; Check to be sure that when Mem2Reg is on that all updates to instructions referring to the original +; 
parameter are updated correctly. When Mem2Reg is on, instructions will refer to the parameters +; directly and not through a load, which is why this is tested separately. + +; Note: the LLVM IR used as input to this test has already had Mem2Reg applied to it, so no need to +; do that here. This happens at higher optimization levels such as -O2. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking + +; CHECK-LABEL: @_ZGVbN4vv_vec_sum +; CHECK: simd.loop: +; CHECK: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK: %vec.i.elem = load i32, i32* %vec.i.cast.gep +; CHECK: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK: %vec.j.elem = load i32, i32* %vec.j.cast.gep +; CHECK: %add = add nsw i32 %vec.i.elem, %vec.j.elem + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %add = add nsw i32 %i, %j + ret i32 %add +} + +attributes #0 = { nounwind uwtable "_ZGVbM4vv_vec_sum" "_ZGVbN4vv_vec_sum" "_ZGVcM8vv_vec_sum" "_ZGVcN8vv_vec_sum" "_ZGVdM8vv_vec_sum" "_ZGVdN8vv_vec_sum" "_ZGVeM16vv_vec_sum" "_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/uniform.ll =================================================================== --- test/Transforms/VecClone/uniform.ll +++ test/Transforms/VecClone/uniform.ll @@ -0,0 +1,25 @@ +; Check to make sure the initial parameter store of the uniform parameter is sunk into the loop. 
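+; The sinking matters because the body below also writes to %b.addr:
+; re-storing %b at the top of every simd.loop iteration lets each lane start
+; from the original uniform value rather than the incremented value left by
+; the previous iteration, which is the behavior the CHECK lines pin down.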
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: <4 x i32> @_ZGVbN4u_foo(i32 %b) +; CHECK: simd.loop: +; CHECK: store i32 %b + +; ModuleID = 'uniform.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %b) #0 { +entry: + %b.addr = alloca i32, align 4 + store i32 %b, i32* %b.addr, align 4 + %0 = load i32, i32* %b.addr, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* %b.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + ret i32 %1 +} + +attributes #0 = { nounwind uwtable "_ZGVbM4u_foo" "_ZGVbN4u_foo" "_ZGVcM8u_foo" "_ZGVcN8u_foo" "_ZGVdM8u_foo" "_ZGVdN8u_foo" "_ZGVeM16u_foo" "_ZGVeN16u_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/void_foo.ll =================================================================== --- test/Transforms/VecClone/void_foo.ll +++ test/Transforms/VecClone/void_foo.ll @@ -0,0 +1,19 @@ +; Check to make sure we can handle void foo() function + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: void @_ZGVbN4_foo() +; CHECK: entry: +; CHECK: ret void + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @foo() #0 { +entry: + ret void +} + +attributes #0 = { nounwind uwtable "_ZGVbM4_foo1" "_ZGVbN4_foo1" "_ZGVcM8_foo1" "_ZGVcN8_foo1" "_ZGVdM8_foo1" "_ZGVdN8_foo1" "_ZGVeM16_foo1" "_ZGVeN16_foo1" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/bugpoint/bugpoint.cpp =================================================================== --- tools/bugpoint/bugpoint.cpp +++ tools/bugpoint/bugpoint.cpp @@ -130,6 +130,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); #ifdef LINK_POLLY_INTO_TOOLS polly::initializePollyPasses(Registry); Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -357,6 +357,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeCodeGenPreparePass(Registry);