Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -628,6 +628,30 @@ /// \brief Additional properties of an operand's values. enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; + /// \brief Targets defined in the vector function ABI. + enum TargetProcessor { + Pentium4, // ISA extension = SSE2, ISA class = XMM + Pentium4SSE3, // ISA extension = SSE3, ISA class = XMM + Core2DuoSSSE3, // ISA extension = SSSE3, ISA class = XMM + Core2DuoSSE41, // ISA extension = SSE4_1, ISA class = XMM + CoreI7SSE42, // ISA extension = SSE4_2, ISA class = XMM + Core2ndGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core3rdGenAVX, // ISA extension = AVX, ISA class = YMM1 + Core4thGenAVX, // ISA extension = AVX2, ISA class = YMM2 + Mic, // ISA extension = Xeon Phi, ISA class = MIC(ZMM) + FutureCpu22, // ISA extension = AVX512, ISA class = ZMM + FutureCpu23, // ISA extension = AVX512, ISA class = ZMM + }; + + /// ISA classes defined in the vector function ABI. + enum ISAClass { + XMM, // (SSE2) + YMM1, // (AVX1) + YMM2, // (AVX2) + ZMM, // (MIC) + ISAClassesNum + }; + /// \return The number of scalar or vector registers that the target has. /// If 'Vectors' is true, it returns the number of vector registers. If it is /// set to false, it returns the number of scalar registers. @@ -887,6 +911,21 @@ unsigned ChainSizeInBytes, VectorType *VecTy) const; + /// \returns The maximum vector register width based on ISAClass \p Class, + /// as defined in the vector function ABI. + unsigned maximumSizeofISAClassVectorRegister(ISAClass Class, Type *Ty) const; + + /// \returns The encoded ISA class for the mangled vector variant name based + /// on \p IsaClass. + char encodeISAClass(ISAClass IsaClass) const; + + /// \returns The ISAClass from the character encoded \p IsaClass of the + /// mangled vector variant function name. + ISAClass decodeISAClass(char IsaClass) const; + + /// \returns The target legalized type of \P Ty based on ISAClass \p IsaClass. + Type* promoteToSupportedType(Type *Ty, ISAClass IsaClass) const; + /// Flags describing the kind of vector reduction. 
struct ReductionFlags { ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {} @@ -1088,6 +1127,11 @@ virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual unsigned maximumSizeofISAClassVectorRegister(ISAClass Class, + Type *Ty) const = 0; + virtual char encodeISAClass(ISAClass IsaClass) const = 0; + virtual ISAClass decodeISAClass(char IsaClass) const = 0; + virtual Type* promoteToSupportedType(Type *Ty, ISAClass IsaClass) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; @@ -1450,6 +1494,19 @@ VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + unsigned maximumSizeofISAClassVectorRegister(ISAClass Class, + Type *Ty) const override { + return Impl.maximumSizeofISAClassVectorRegister(Class, Ty); + } + char encodeISAClass(ISAClass IsaClass) const override { + return Impl.encodeISAClass(IsaClass); + } + ISAClass decodeISAClass(char IsaClass) const override { + return Impl.decodeISAClass(IsaClass); + } + Type* promoteToSupportedType(Type *Ty, ISAClass IsaClass) const override { + return Impl.promoteToSupportedType(Ty, IsaClass); + } bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -529,6 +529,23 @@ return VF; } + unsigned maximumSizeofISAClassVectorRegister( + TTI::ISAClass I, Type *Ty) const { + return 0; + } + + char encodeISAClass(TTI::ISAClass IsaClass) const { + return '?'; + } + + TTI::ISAClass decodeISAClass(char IsaClass) const { + return TTI::ISAClassesNum; + } + + Type* promoteToSupportedType(Type *Ty, TTI::ISAClass IsaClass) const { + return Ty; + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; Index: include/llvm/Analysis/VectorVariant.h =================================================================== --- include/llvm/Analysis/VectorVariant.h +++ include/llvm/Analysis/VectorVariant.h @@ -0,0 +1,231 @@ +//===---- llvm/Transforms/VectorVariant.h - Vector utilities -*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This header file defines the VectorVariant class and implements the encoding +/// and decoding utilities for VectorVariant objects. Multiple VectorVariant +/// objects can be created (masked, non-masked, etc.) and associated with the +/// original scalar function. These objects are then used to clone new functions +/// that can be vectorized. This class follows the standards defined in the +/// vector function ABI. 
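As a concrete illustration of the decoding described above, a minimal usage sketch follows (hypothetical caller code; it assumes a valid TargetTransformInfo pointer and the X86 ISA-class letters added elsewhere in this patch):

    // "_ZGVbN4uvl_" decodes as: 'b' = XMM ISA class, 'N' = non-masked,
    // vector length 4, parameters: uniform, vector, linear with unit stride.
    VectorVariant Variant("_ZGVbN4uvl_", TTI);
    Variant.getISA();                    // TargetTransformInfo::XMM
    Variant.isMasked();                  // false
    Variant.getVlen();                   // 4
    Variant.generateFunctionName("foo"); // "_ZGVbN4uvl_foo"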
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H +#define LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H + +#include +#include +#include +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define STRIDE_KIND 's' +#define LINEAR_KIND 'l' +#define UNIFORM_KIND 'u' +#define VECTOR_KIND 'v' + +#define NOT_ALIGNED 1 + +#define POSITIVE 1 +#define NEGATIVE -1 + +class VectorKind { + +public: + VectorKind(char K, int S, int A = NOT_ALIGNED) { + + assert((S == notAValue() || K == STRIDE_KIND || K == LINEAR_KIND) && + "only linear vectors have strides"); + + assert((K != LINEAR_KIND || S != notAValue()) && + "linear vectors must have a stride"); + + assert((K != STRIDE_KIND || S != notAValue()) && + "variable stride vectors must have a stride"); + + assert((K != STRIDE_KIND || S >= 0) && + "variable stride position must be non-negative"); + + assert(A > 0 && "alignment must be positive"); + + Kind = K; + Stride = S; + Alignment = A; + } + + VectorKind(const VectorKind &Other) { + Kind = Other.Kind; + Stride = Other.Stride; + Alignment = Other.Alignment; + } + + /// \brief Is the stride for a linear parameter a uniform variable? (i.e., + /// the stride is stored in a variable but is uniform) + bool isVariableStride() { return Kind == STRIDE_KIND; } + + /// \brief Is the stride for a linear variable non-unit stride? + bool isNonUnitStride() { return Kind == LINEAR_KIND && Stride != 1; } + + /// \brief Is the stride for a linear variable unit stride? + bool isUnitStride() { return Kind == LINEAR_KIND && Stride == 1; } + + /// \brief Is this a linear parameter? + bool isLinear() { + return isVariableStride() || isNonUnitStride() || isUnitStride(); + } + + /// \brief Is this a uniform parameter? + bool isUniform() { return Kind == UNIFORM_KIND; } + + /// \brief Is this a vector parameter? + bool isVector() { return Kind == VECTOR_KIND; } + + /// \brief Is the parameter aligned? + bool isAligned() { return Alignment != NOT_ALIGNED; } + + /// \brief Get the stride associated with a linear parameter. + int getStride() { return Stride; } + + /// \brief Get the alignment associated with a linear parameter. + int getAlignment() { return Alignment; } + + /// \brief Represents a don't care value for strides of parameters other + /// than linear parameters. + static int notAValue() { return -1; } + + /// \brief Encode the parameter information into a mangled string + /// corresponding to the standards defined in the vector function ABI. + std::string encode() { + std::stringstream SST; + SST << Kind; + + if (isNonUnitStride()) { + if (Stride >= 0) + SST << Stride; + else + SST << "n" << -Stride; + } + + if (isVariableStride()) + SST << Stride; + + if (isAligned()) + SST << 'a' << Alignment; + + return SST.str(); + } + +private: + char Kind; // linear, uniform, vector + int Stride; + int Alignment; +}; + +class VectorVariant { + +private: + TargetTransformInfo *TTI; + TargetTransformInfo::ISAClass Isa; + bool Mask; + unsigned int Vlen; + std::vector Parameters; + + static std::string prefix() { return "_ZGV"; } + +public: + VectorVariant(StringRef FuncName, TargetTransformInfo *TTI); + + /// \brief Get the ISA corresponding to this vector variant. + TargetTransformInfo::ISAClass getISA() { return Isa; } + + /// \brief Is this a masked vector function variant? 
+  bool isMasked() { return Mask; }
+
+  /// \brief Get the vector length of the vector variant.
+  unsigned int getVlen() { return Vlen; }
+
+  /// \brief Get the parameters of the vector variant.
+  std::vector<VectorKind> &getParameters() { return Parameters; }
+
+  /// \brief Build the mangled name for the vector variant. This function
+  /// builds a mangled name by including the encodings for the ISA class,
+  /// mask information, and all parameters.
+  std::string encode() {
+
+    std::stringstream SST;
+    SST << prefix() << TTI->encodeISAClass(Isa) << encodeMask(Mask) << Vlen;
+
+    std::vector<VectorKind>::iterator It = Parameters.begin();
+    std::vector<VectorKind>::iterator End = Parameters.end();
+
+    if (isMasked())
+      End--; // mask parameter is not encoded
+
+    for (; It != End; ++It)
+      SST << (*It).encode();
+
+    SST << "_";
+
+    return SST.str();
+  }
+
+  /// \brief Generate a function name corresponding to a vector variant.
+  std::string generateFunctionName(StringRef ScalarFuncName) {
+
+    static StringRef ManglingPrefix("_Z");
+    std::string Name = encode();
+
+    if (ScalarFuncName.startswith(ManglingPrefix))
+      return Name + ScalarFuncName.drop_front(ManglingPrefix.size()).str();
+    else
+      return Name + ScalarFuncName.str();
+  }
+
+  /// \brief Some targets do not support particular types, so promote to a type
+  /// that is supported.
+  Type *promoteToSupportedType(Type *Ty) {
+    return TTI->promoteToSupportedType(Ty, getISA());
+  }
+
+  /// \brief Check to see if this is a vector variant based on the function
+  /// name.
+  static bool isVectorVariant(StringRef FuncName) {
+    return FuncName.startswith(prefix());
+  }
+
+  /// \brief Encode the mask information for the mangled variant name.
+  static char encodeMask(bool EncodeMask) {
+
+    return EncodeMask ? 'M' : 'N';
+  }
+
+  /// \brief Decode the mask information from the mangled variant name.
+  static bool decodeMask(char MaskToDecode) {
+
+    switch (MaskToDecode) {
+    case 'M':
+      return true;
+    case 'N':
+      return false;
+    }
+
+    llvm_unreachable("unsupported mask");
+  }
+
+  /// \brief Calculate the vector length for the vector variant.
+ unsigned calcVlen(TargetTransformInfo::ISAClass I, Type *Ty); +}; + +#endif // LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -377,6 +377,7 @@ void initializeWriteBitcodePassPass(PassRegistry&); void initializeWriteThinLTOBitcodePass(PassRegistry&); void initializeXRayInstrumentationPass(PassRegistry&); +void initializeVecClonePass(PassRegistry&); } // end namespace llvm Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -48,6 +48,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/VecClone.h" #include "llvm/Transforms/Vectorize.h" #include @@ -207,6 +208,7 @@ (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); (void) llvm::createScalarizeMaskedMemIntrinPass(); + (void) llvm::createVecClonePass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); Index: include/llvm/Transforms/Utils/VecClone.h =================================================================== --- include/llvm/Transforms/Utils/VecClone.h +++ include/llvm/Transforms/Utils/VecClone.h @@ -0,0 +1,236 @@ +//===-------------- VecClone.h - Class definition -*- C++ -*---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This file defines the VecClone pass class. +/// +// ===--------------------------------------------------------------------=== // + +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "llvm/Analysis/VectorVariant.h" + +#ifndef LLVM_TRANSFORMS_VPO_VECCLONE_H +#define LLVM_TRANSFORMS_VPO_VECCLONE_H + +enum InstType { + ALLOCA = 0, + STORE, + BITCAST +}; + +namespace llvm { + +class ModulePass; + +/// \brief Represents the mapping of a vector parameter to its corresponding +/// vector to scalar type cast instruction. This done so that the scalar loop +/// inserted by this pass contains instructions that are in scalar form so that +/// the loop can later be vectorized. +struct ParmRef { + // Represents the parameter in one of two forms: + // 1) A vector alloca instruction if the parameter has not been registerized. + // 2) The parameter as the Value* passed in via the function call. + Value *VectorParm; + + // Represents the vector parameter cast from a vector type to scalar type. + Instruction *VectorParmCast; +}; + +class VecClone : public ModulePass { + + private: + + /// \brief Contains the names of the declared vector function variants + typedef std::vector DeclaredVariants; + + /// \brief Contains a mapping of a function to its vector function variants + typedef std::map FunctionVariants; + + /// \brief Determine the characteristic type of the vector function as + /// specified according to the vector function ABI. 
+ Type* calcCharacteristicType(Function& F, VectorVariant& Variant); + + /// \brief Get all functions marked for vectorization in module and their + /// list of variants. + void getFunctionsToVectorize( + Module &M, std::map > &FuncVars); + + /// \brief Returns a floating point or integer constant depending on Ty. + template + Constant* getConstantValue(Type *Ty, LLVMContext &Context, T Val); + + /// \brief Return true if the function has a complex type for the return + /// or parameters. + bool hasComplexType(Function *F); + + /// \brief Make a copy of the function if it is marked as SIMD. + Function* CloneFunction(Function &F, VectorVariant &V); + + /// \brief Take the entry basic block for the function as split off a second + /// basic block that will form the loop entry. + BasicBlock* splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock); + + /// \brief Take the loop entry basic block and split off a second basic + /// block into a new return basic block. + BasicBlock* splitLoopIntoReturn(Function *Clone, BasicBlock *LoopBlock); + + /// \brief Generate a basic block to test the loop exit condition. + BasicBlock* createLoopExit(Function *Clone, BasicBlock *ReturnBlock); + + /// \brief Update the predecessors of the return basic block. + void updateReturnPredecessors(Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the backedge from the loop exit basic block to the loop + /// entry block. + PHINode* createPhiAndBackedgeForLoop(Function *Clone, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock, + int VL); + + /// \brief Generate vector alloca instructions for vector parameters and + /// change the parameter types to vector types. Expand the return value of + /// the function to a vector type. This function returns the instruction + /// corresponding to the expanded return and the instruction corresponding + /// to the mask. + Instruction* expandVectorParametersAndReturn( + Function *Clone, + VectorVariant &V, + Instruction **Mask, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Expand the function parameters to vector types. This function + /// returns the instruction corresponding to the mask. + Instruction* expandVectorParameters( + Function *Clone, + VectorVariant &V, + BasicBlock *EntryBlock, + std::vector& ParmMap); + + /// \brief Expand the function's return value to a vector type. + Instruction* expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, BasicBlock *ReturnBlock, + std::vector& ParmMap); + + /// \brief Update the old parameter references to with the new vector + /// references. + void updateScalarMemRefsWithVector( + Function *Clone, + Function &F, + BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, + PHINode *Phi, + std::vector& ParmMap); + + /// \brief Update the values of linear parameters by adding the stride + /// before the use. + void updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi); + + /// \brief Update the instructions in the return basic block to return a + /// vector temp. + void updateReturnBlockInstructions(Function *Clone, BasicBlock *ReturnBlock, + Instruction *VecReturnAlloca); + + /// \brief Create a separate basic block to mark the begin and end of the + /// SIMD loop formed from the vector function. 
Essentially, this function + /// transfers the information from the SIMD function keywords and creates + /// new loop pragmas so that parameter information can be transferred to + /// the loop. + void insertDirectiveIntrinsics(Module& M, Function *Clone, Function &F, + VectorVariant &V, + BasicBlock *EntryBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create the basic block indicating the begin of the SIMD loop. + void insertBeginRegion(Module& M, Function *Clone, Function &F, + VectorVariant &V, BasicBlock *EntryBlock); + + /// \brief Create the basic block indicating the end of the SIMD loop. + void insertEndRegion(Module& M, Function *Clone, BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock); + + /// \brief Create a new vector alloca instruction for the return vector and + /// bitcast to the appropriate element type. + Instruction* createExpandedReturn(Function *F, BasicBlock *BB, + VectorType *ReturnType); + + /// \brief Return the position of the parameter in the function's parameter + /// list. + int getParmIndexInFunction(Function *F, Value *Parm); + + /// \brief Check to see if the function is simple enough that a loop does + /// not need to be inserted into the function. + bool isSimpleFunction(Function *Clone, VectorVariant &V, + ReturnInst *Return); + + /// \brief Inserts the if/else split and mask condition for masked SIMD + /// functions. + void insertSplitForMaskedVariant(Function *Clone, BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi); + + /// \brief Utility function to insert instructions with other instructions + /// of the same kind. + void insertInstruction(Instruction *Inst, BasicBlock *BB); + + /// \brief Utility function that generates instructions that calculate the + /// stride for a linear parameter. + Instruction* generateStrideForParameter(Function *Clone, Argument *Arg, + Instruction *ParmUser, int Stride, + PHINode *Phi); + + /// \brief Utility function that returns true if Inst is a store of a vector + /// or linear parameter. + bool isVectorOrLinearParamStore(Function *Clone, + std::vector &ParmKinds, + Instruction *Inst); + + /// \brief Removes the original scalar alloca instructions that correspond + /// to a vector parameter before widening. + void removeScalarAllocasForVectorParams( + std::vector &VectorParmMap); + + /// \brief Adds metadata to the conditional branch of the simd loop latch to + /// prevent loop unrolling. + void disableLoopUnrolling(BasicBlock *Latch); + + /// \brief Check to see that the type of the gep used for a load instruction + /// is compatible with the type needed as the result of the load. Basically, + /// check the validity of the LLVM IR to make sure that proper pointer + /// dereferencing is done. 
+ bool typesAreCompatibleForLoad(Type *GepType, Type *LoadType); + + bool runOnModule(Module &M) override; + + public: + + static char ID; + VecClone(); + void print(raw_ostream &OS, const Module * = nullptr) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +}; // end pass class + +ModulePass *createVecClonePass(); + +} // end llvm namespace + +#endif // LLVM_TRANSFORMS_VPO_VECCLONE_H Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -84,6 +84,7 @@ ValueLatticeUtils.cpp ValueTracking.cpp VectorUtils.cpp + VectorVariant.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -584,6 +584,26 @@ return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +unsigned TargetTransformInfo::maximumSizeofISAClassVectorRegister( + ISAClass I, Type *Ty) const { + + return TTIImpl->maximumSizeofISAClassVectorRegister(I, Ty); +} + +char TargetTransformInfo::encodeISAClass(ISAClass IsaClass) const { + return TTIImpl->encodeISAClass(IsaClass); +} + +TargetTransformInfo::ISAClass TargetTransformInfo::decodeISAClass( + char IsaClass) const { + return TTIImpl->decodeISAClass(IsaClass); +} + +Type* TargetTransformInfo::promoteToSupportedType(Type *Ty, + ISAClass IsaClass) const { + return TTIImpl->promoteToSupportedType(Ty, IsaClass); +} + bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); Index: lib/Analysis/VectorVariant.cpp =================================================================== --- lib/Analysis/VectorVariant.cpp +++ lib/Analysis/VectorVariant.cpp @@ -0,0 +1,112 @@ +//===---------- VectorVariant.cpp - Vector function ABI -*- C++ -*---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VectorVariant class and corresponding utilities. +/// VectorVariant objects are associated with a scalar function and are used +/// to generate new functions that can be vectorized. VectorVariants are +/// determined by inspecting the function attributes associated with the scalar +/// function. When a mangled function name is found in the attributes (indicated +/// as "_ZGV"), a VectorVariant object is created. The class and utilities +/// in this file follow the standards defined in the vector function ABI. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/Type.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +/// \brief Generate a vector variant by decoding the mangled string for the +/// variant contained in the original scalar function's attributes. For +/// example: "_ZGVxN4". The name mangling is defined in the vector function +/// ABI. Based on this string, the parameter kinds (uniform, linear, vector), +/// vector length, parameter alignment, and masking are determined. 
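To make the decoding performed by the constructor below concrete, consider a hypothetical masked AVX-512 variant name (the ISA letters come from the X86 encodeISAClass/decodeISAClass hooks added in this patch; other targets may use different letters):

    _ZGVeM16ul4a32v_foo
      e     -> ISA class ZMM
      M     -> masked variant
      16    -> vector length
      u     -> parameter 0: uniform
      l4a32 -> parameter 1: linear with constant stride 4, alignment 32
      v     -> parameter 2: vector
      (an extra vector mask parameter is appended to Parameters because the
       variant is masked; a negative stride is written with 'n', e.g. "ln2"
       for stride -2)

The vector length in the name is consistent with calcVlen below: for a 32-bit characteristic type on ZMM, 512 / 32 = 16.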
+VectorVariant::VectorVariant(StringRef FuncName, + TargetTransformInfo *TTI) : TTI(TTI) { + + assert(isVectorVariant(FuncName) && "invalid vector variant format"); + + std::stringstream SST(FuncName.drop_front(prefix().size())); + + // mandatory annotations + char EncodedISA; + SST.get(EncodedISA); + Isa = TTI->decodeISAClass(EncodedISA); + + char EncodedMask; + SST.get(EncodedMask); + Mask = decodeMask(EncodedMask); + SST >> Vlen; + + // optional parameter annotations + while (SST.peek() != '_') { + + char Kind; + int Stride = VectorKind::notAValue(); + int StrideSign = POSITIVE; + int Alignment = NOT_ALIGNED; + + // Get parameter kind + SST.get(Kind); + + // Default stride for linear is 1. If the stride for a parameter is 1, + // then the front-end will not encode it and we will not have set the + // correct stride below. + if (Kind == LINEAR_KIND) + Stride = 1; + + // Handle optional stride + if (SST.peek() == 'n') { + // Stride is negative + SST.ignore(1); + StrideSign = NEGATIVE; + } + + if (std::isdigit(SST.peek())) { + // Extract constant stride + SST >> Stride; + assert((Kind != STRIDE_KIND || Stride >= 0) && + "variable stride argument index cannot be negative"); + } + + Stride *= StrideSign; + // Handle optional alignment + if (SST.peek() == 'a') { + SST.ignore(1); + SST >> Alignment; + } + + VectorKind VecKind(Kind, Stride, Alignment); + Parameters.push_back(VecKind); + } + + if (Mask) { + // Masked variants will have an additional mask parameter + VectorKind VecKind(VECTOR_KIND, VectorKind::notAValue()); + Parameters.push_back(VecKind); + } +} + +/// \brief Determine the vector variant's vector length based on the +/// characteristic data type defined in the vector function ABI and target +/// vector register width. +unsigned int VectorVariant::calcVlen(TargetTransformInfo::ISAClass I, + Type* CharacteristicDataType) { + assert(CharacteristicDataType && + CharacteristicDataType->getPrimitiveSizeInBits() != 0 && + "expected characteristic data type to have a primitive size in bits"); + + unsigned int VectorRegisterSize = + TTI->maximumSizeofISAClassVectorRegister(I, CharacteristicDataType); + + return VectorRegisterSize / CharacteristicDataType->getPrimitiveSizeInBits(); +} Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -129,6 +129,12 @@ const Function *Callee) const; bool enableMemCmpExpansion(unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); + + unsigned maximumSizeofISAClassVectorRegister(TTI::ISAClass IsaClass, + Type *Ty) const; + char encodeISAClass(TTI::ISAClass IsaClass) const; + TargetTransformInfo::ISAClass decodeISAClass(char IsaClass) const; + Type* promoteToSupportedType(Type *Ty, TTI::ISAClass IsaClass) const; private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2808,3 +2808,83 @@ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } + +unsigned X86TTIImpl::maximumSizeofISAClassVectorRegister(TTI::ISAClass I, + Type *Ty) const { + + assert((Ty->isIntegerTy() || Ty->isFloatTy() || Ty->isDoubleTy() || + Ty->isPointerTy()) && + "unsupported type"); + + unsigned int 
VectorRegisterSize = 0; + + switch (I) { + case TTI::XMM: + VectorRegisterSize = 128; + break; + case TTI::YMM1: + if (Ty->isIntegerTy() || Ty->isPointerTy()) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case TTI::YMM2: + if (Ty->isIntegerTy(8)) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case TTI::ZMM: + VectorRegisterSize = 512; + break; + default: + llvm_unreachable("unknown isa class"); + return 0; + } + + assert(VectorRegisterSize != 0 && "unsupported ISA/type combination"); + return VectorRegisterSize; +} + +char X86TTIImpl::encodeISAClass(TTI::ISAClass IsaClass) const { + switch (IsaClass) { + case TTI::XMM: + return 'b'; + case TTI::YMM1: + return 'c'; + case TTI::YMM2: + return 'd'; + case TTI::ZMM: + return 'e'; + default: + break; + } + + assert(false && "unsupported ISA class"); + return '?'; +} + +TargetTransformInfo::ISAClass X86TTIImpl::decodeISAClass(char IsaClass) const { + switch (IsaClass) { + case 'b': + return TTI::XMM; + case 'c': + return TTI::YMM1; + case 'd': + return TTI::YMM2; + case 'e': + return TTI::ZMM; + default: + llvm_unreachable("unsupported ISA class"); + return TTI::XMM; + } +} + +Type* X86TTIImpl::promoteToSupportedType(Type *Ty, TTI::ISAClass I) const { + // On ZMM promote char and short to int + if (I == TargetTransformInfo::ISAClass::ZMM && (Ty->isIntegerTy(8) || + Ty->isIntegerTy(16))) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Utils/VecClone.h" using namespace llvm; @@ -94,6 +95,9 @@ "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); +static cl::opt RunVecClone("enable-vec-clone", cl::init(false), cl::Hidden, + cl::desc("Run Vector Function Cloning")); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -426,6 +430,10 @@ // new unnamed globals. if (PrepareForThinLTO) MPM.add(createNameAnonGlobalPass()); + + if (RunVecClone) + MPM.add(createVecClonePass()); + return; } @@ -588,6 +596,9 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass()); + if (RunVecClone) + MPM.add(createVecClonePass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -52,6 +52,7 @@ UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + VecClone.cpp VNCoercion.cpp ADDITIONAL_HEADER_DIRS Index: lib/Transforms/Utils/VecClone.cpp =================================================================== --- lib/Transforms/Utils/VecClone.cpp +++ lib/Transforms/Utils/VecClone.cpp @@ -0,0 +1,1727 @@ +//=------- VecClone.cpp - Vector function to loop transform -*- C++ -*-------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This pass inserts the body of a vector function inside a vector length +/// trip count scalar loop for functions that are declared SIMD. The pass +/// currently follows the gcc vector ABI requirements for name mangling +/// encodings, but will be extended in the future to also support the Intel +/// vector ABI. References to both ABIs can be found here: +/// +/// https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt +/// https://software.intel.com/sites/default/files/managed/b4/c8/Intel-Vector-Function-ABI.pdf +/// +/// Conceptually, this pass performs the following transformation: +/// +/// Before Translation: +/// +/// main.cpp +/// +/// #pragma omp declare simd uniform(a) linear(k) +/// extern float dowork(float *a, float b, int k); +/// +/// float a[4096]; +/// float b[4096]; +/// int main() { +/// int k; +/// for (k = 0; k < 4096; k++) { +/// b[k] = k; +/// } +/// #pragma clang loop vectorize(enable) +/// for (k = 0; k < 4096; k++) { +/// a[k] = k * 0.5; +/// a[k] = dowork(a, b[k], k); +/// } +/// } +/// +/// dowork.cpp +/// +/// #pragma omp declare simd uniform(a) linear(k) #0 +/// float dowork(float *a, float b, int k) { +/// return sinf(a[k]) + b; +/// } +/// +/// attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4uvl_", +/// "ZGVbN4uvl_", ... } +/// +/// After Translation: +/// +/// dowork.cpp +/// +/// // Non-masked variant +/// +/// "_ZGVbN4uvl_dowork(float *a, b, int k) { +/// alloc vec_ret; +/// alloc vec_b; +/// // casts from vector to scalar pointer allows loop to be in a scalar form +/// // that can be vectorized easily. +/// ret_cast = bitcast * vec_ret to float*; +/// vec_b_cast = bitcast * vec_b to float*; +/// store b, * vec_b; +/// for (int i = 0; i < VL; i++, k++) { +/// ret_cast[i] = sinf(a[k]) + vec_b_cast[i]; +/// } +/// return vec_ret; +/// } +/// +/// // Masked variant +/// +/// "_ZGVbM4uvl_dowork(float *a, b, int k, +/// mask) { +/// alloc vec_ret; +/// alloc vec_b; +/// ret_cast = bitcast * vec_ret to float*; +/// vec_b_cast = bitcast * vec_b to float*; +/// store b, * vec_b; +/// for (int i = 0; i < VL; i++, k++) { +/// if (mask[i] != 0) +/// ret_cast[i] = sinf(a[k]) + vec_b_cast[i]; +/// } +/// return vec_ret; +/// } +/// +// ===--------------------------------------------------------------------=== // + +// This pass is flexible enough to recognize whether or not parameters have been +// registerized so that the users of the parameter can be properly updated. For +// instance, we need to know where the users of linear parameters are so that +// the stride can be added to them. +// +// In the following example, %i and %x are used directly by %add directly, so +// in this case the pass can just look for users of %i and %x. +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %add = add nsw i32 %x, %i +// ret i32 %add +// } +// +// When parameters have not been registerized, parameters are used indirectly +// through a store/load of the parameter to/from memory that has been allocated +// for them in the function. Thus, in this case, the pass looks for users of +// %0 and %1. 
+// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %i.addr = alloca i32, align 4 +// %x.addr = alloca i32, align 4 +// store i32 %i, i32* %i.addr, align 4 +// store i32 %x, i32* %x.addr, align 4 +// %0 = load i32, i32* %x.addr, align 4 +// %1 = load i32, i32* %i.addr, align 4 +// %add = add nsw i32 %0, %1 +// ret i32 %add +// } +// +// The pass must run at all optimization levels because it is possible that +// a loop calling the vector function is vectorized, but the vector function +// itself is not vectorized. For example, above main.cpp may be compiled at +// -O2, but dowork.cpp may be compiled at -O0. Therefore, it is required that +// the attribute list for the vector function specify all variants that must +// be generated by this pass so as to avoid any linking problems. This pass +// also serves to canonicalize the input IR to the loop vectorizer. + +#include "llvm/Transforms/Utils/VecClone.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include + +#define SV_NAME "vec-clone" +#define DEBUG_TYPE "VecClone" + +using namespace llvm; + +VecClone::VecClone() : ModulePass(ID) {} + +void VecClone::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + +Type *VecClone::calcCharacteristicType(Function &F, VectorVariant &Variant) { + Type *ReturnType = F.getReturnType(); + Type *CharacteristicDataType = nullptr; + + if (!ReturnType->isVoidTy()) + CharacteristicDataType = ReturnType; + + if (!CharacteristicDataType) { + + std::vector &ParmKinds = Variant.getParameters(); + Function::const_arg_iterator ArgIt = F.arg_begin(); + Function::const_arg_iterator ArgEnd = F.arg_end(); + std::vector::iterator VKIt = ParmKinds.begin(); + + for (; ArgIt != ArgEnd; ++ArgIt, ++VKIt) { + if (VKIt->isVector()) { + CharacteristicDataType = (*ArgIt).getType(); + break; + } + } + } + + // TODO except Clang's ComplexType + if (!CharacteristicDataType || CharacteristicDataType->isStructTy()) { + CharacteristicDataType = Type::getInt32Ty(F.getContext()); + } + + // Legalize the characteristic type based on target requirements. + CharacteristicDataType = + Variant.promoteToSupportedType(CharacteristicDataType); + + if (CharacteristicDataType->isPointerTy()) { + // For such cases as 'int* foo(int x)', where x is a non-vector type, the + // characteristic type at this point will be i32*. If we use the DataLayout + // to query the supported pointer size, then a promotion to i64* is + // incorrect because the mask element type will mismatch the element type + // of the characteristic type. + PointerType *PointerTy = cast(CharacteristicDataType); + CharacteristicDataType = PointerTy->getElementType(); + } + + return CharacteristicDataType; +} + +void VecClone::getFunctionsToVectorize( + llvm::Module &M, std::map> &FuncVars) { + + // FuncVars will contain a 1-many mapping between the original scalar + // function and the vector variant encoding strings (represented as + // attributes). The encodings correspond to functions that will be created by + // the caller of this function as vector versions of the original function. 
+ // For example, if foo() is a function marked as a simd function, it will have + // several vector variant encodings like: "_ZGVbM4_foo", "_ZGVbN4_foo", + // "_ZGVcM8_foo", "_ZGVcN8_foo", "_ZGVdM8_foo", "_ZGVdN8_foo", "_ZGVeM16_foo", + // "_ZGVeN16_foo". The caller of this function will then clone foo() and name + // the clones using the above name manglings. The variant encodings correspond + // to differences in masked/non-masked execution, vector length, and target + // vector register size, etc. For more details, please refer to the following + // reference for details on the vector function encodings. + // https://www.cilkplus.org/sites/default/files/open_specifications/ + // Intel-ABI-Vector-Function-2012-v0.9.5.pdf + + for (auto It = M.begin(), End = M.end(); It != End; ++It) { + Function &F = *It; + if (F.hasFnAttribute("vector-variants")) { + Attribute Attr = F.getFnAttribute("vector-variants"); + StringRef VariantsStr = Attr.getValueAsString(); + SmallVector Variants; + VariantsStr.split(Variants, ','); + for (unsigned i = 0; i < Variants.size(); i++) { + FuncVars[&F].push_back(Variants[i]); + } + } + } +} + +template Constant * +VecClone::getConstantValue(Type *Ty, LLVMContext &Context, int Val); +template Constant * +VecClone::getConstantValue(Type *Ty, LLVMContext &Context, float Val); +template Constant * +VecClone::getConstantValue(Type *Ty, LLVMContext &Context, double Val); +template +Constant *VecClone::getConstantValue(Type *Ty, LLVMContext &Context, T Val) { + Constant *ConstVal = nullptr; + + if (Ty->isIntegerTy()) { + ConstVal = ConstantInt::get(Ty, Val); + } else if (Ty->isFloatTy()) { + ConstVal = ConstantFP::get(Ty, Val); + } + + assert(ConstVal && "Could not generate constant for type"); + + return ConstVal; +} + +void VecClone::insertInstruction(Instruction *Inst, BasicBlock *BB) { + // This function inserts instructions in a way that groups like instructions + // together for debuggability/readability purposes. This was designed to make + // the entry basic block easier to read since this pass creates/modifies + // alloca, store, and bitcast instructions for each vector parameter and + // return. Thus, this function ensures all allocas are grouped together, all + // stores are grouped together, and so on. If the type of instruction passed + // in does not exist in the basic block, then it is added to the end of the + // basic block, just before the terminator instruction. + + BasicBlock::reverse_iterator BBIt = BB->rbegin(); + BasicBlock::reverse_iterator BBEnd = BB->rend(); + BasicBlock::iterator AnchorInstIt = BB->end(); + AnchorInstIt--; + Instruction *Anchor = &*AnchorInstIt; + + for (; BBIt != BBEnd; ++BBIt) { + if (Inst->getOpcode() == (&*BBIt)->getOpcode()) { + Anchor = &*BBIt; + break; + } + } + + if (isa(Anchor)) { + Inst->insertBefore(Anchor); + } else { + Inst->insertAfter(Anchor); + } +} + +bool VecClone::hasComplexType(Function *F) { + Function::arg_iterator ArgListIt = F->arg_begin(); + Function::arg_iterator ArgListEnd = F->arg_end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + // Complex types for parameters/return come in as vector. 
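For the attribute handling in getFunctionsToVectorize above, a hypothetical IR-level example (names illustrative; the attribute value is a single comma-separated string that the pass splits on ','):

    define float @dowork(float* %a, float %b, i32 %k) #0 { ... }
    attributes #0 = { nounwind "vector-variants"="_ZGVbM4uvl_,_ZGVbN4uvl_" }

    ; after the split, FuncVars maps @dowork to
    ;   { "_ZGVbM4uvl_", "_ZGVbN4uvl_" }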
+ if (ArgListIt->getType()->isVectorTy()) { + return true; + } + } + + return false; +} + +Function *VecClone::CloneFunction(Function &F, VectorVariant &V) { + + DEBUG(dbgs() << "Cloning Function: " << F.getName() << "\n"); + DEBUG(F.dump()); + + FunctionType *OrigFunctionType = F.getFunctionType(); + Type *ReturnType = F.getReturnType(); + Type *CharacteristicType = calcCharacteristicType(F, V); + + // Expand return type to vector. + if (!ReturnType->isVoidTy()) + ReturnType = VectorType::get(ReturnType, V.getVlen()); + + std::vector ParmKinds = V.getParameters(); + SmallVector ParmTypes; + FunctionType::param_iterator ParmIt = OrigFunctionType->param_begin(); + FunctionType::param_iterator ParmEnd = OrigFunctionType->param_end(); + std::vector::iterator VKIt = ParmKinds.begin(); + for (; ParmIt != ParmEnd; ++ParmIt, ++VKIt) { + if (VKIt->isVector()) + ParmTypes.push_back( + VectorType::get((*ParmIt)->getScalarType(), V.getVlen())); + else + ParmTypes.push_back(*ParmIt); + } + + if (V.isMasked()) { + Type *MaskVecTy = VectorType::get(CharacteristicType, V.getVlen()); + ParmTypes.push_back(MaskVecTy); + } + + FunctionType *CloneFuncType = FunctionType::get(ReturnType, ParmTypes, false); + + std::string VariantName = V.generateFunctionName(F.getName()); + Function *Clone = Function::Create( + CloneFuncType, GlobalValue::ExternalLinkage, VariantName, F.getParent()); + + // Remove vector variant attributes from the original function. They are + // not needed for the cloned function and it prevents any attempts at + // trying to clone the function again in case the pass is called more than + // once. + F.removeFnAttr("vector-variants"); + + Function::arg_iterator ArgIt = F.arg_begin(); + Function::arg_iterator ArgEnd = F.arg_end(); + ValueToValueMapTy Vmap; + Function::arg_iterator NewArgIt = Clone->arg_begin(); + for (; ArgIt != ArgEnd; ++ArgIt, ++NewArgIt) { + NewArgIt->setName(ArgIt->getName()); + Vmap[&*ArgIt] = &*NewArgIt; + } + + if (V.isMasked()) { + Argument &MaskArg = *NewArgIt; + MaskArg.setName("mask"); + } + + SmallVector Returns; + CloneFunctionInto(Clone, &F, Vmap, true, Returns); + + // Remove incompatible argument attributes (applied to the scalar argument, + // does not apply to its vector counterpart). This must be done after cloning + // the function because CloneFunctionInto() transfers parameter attributes + // from the original parameters in the Vmap. 
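For the dowork example in the file header of VecClone.cpp, the signatures produced by CloneFunction look roughly as follows (illustrative; vector length 4, float characteristic type):

    ; scalar original
    define float @dowork(float* %a, float %b, i32 %k)

    ; non-masked "_ZGVbN4uvl_": uniform %a and linear %k stay scalar,
    ; the vector parameter %b and the return are widened
    define <4 x float> @_ZGVbN4uvl_dowork(float* %a, <4 x float> %b, i32 %k)

    ; masked "_ZGVbM4uvl_": an extra mask parameter of the characteristic
    ; type is appended
    define <4 x float> @_ZGVbM4uvl_dowork(float* %a, <4 x float> %b, i32 %k,
                                          <4 x float> %mask)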
+ ArgIt = Clone->arg_begin(); + ArgEnd = Clone->arg_end(); + AttrBuilder AB; + for (uint64_t Idx = 0; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + Type *ArgType = (*ArgIt).getType(); + AB = AttributeFuncs::typeIncompatible(ArgType); + Clone->removeParamAttrs(Idx, AB); + } + + AB = AttributeFuncs::typeIncompatible(ReturnType); + Clone->removeAttributes(AttributeList::ReturnIndex, AB); + + DEBUG(dbgs() << "After Cloning and Function Signature expansion\n"); + DEBUG(Clone->dump()); + + return Clone; +} + +bool VecClone::isVectorOrLinearParamStore(Function *Clone, + std::vector &ParmKinds, + Instruction *Inst) { + if (StoreInst *Store = dyn_cast(Inst)) { + Value *Op0 = Store->getOperand(0); + Function::arg_iterator ArgListIt = Clone->arg_begin(); + Function::arg_iterator ArgListEnd = Clone->arg_end(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + unsigned ParmIdx = ArgListIt->getArgNo(); + if (&*ArgListIt == Op0 && + (ParmKinds[ParmIdx].isVector() || ParmKinds[ParmIdx].isLinear())) { + return true; + } + } + } + + return false; +} + +BasicBlock *VecClone::splitEntryIntoLoop(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock) { + + // EntryInsts contains all instructions that need to stay in the entry basic + // block. These instructions include allocas and stores involving vector and + // linear parameters to alloca. Linear parameter stores to alloca are kept in + // the entry block because there will be a load from this alloca in the loop + // for which we will apply the stride. Instructions involving uniform + // parameter stores to alloca should be sunk into the loop to maintain + // uniform behavior. All instructions involving private variables are also + // sunk into the loop. + + SmallVector EntryInsts; + std::vector ParmKinds = V.getParameters(); + BasicBlock::iterator BBIt = EntryBlock->begin(); + BasicBlock::iterator BBEnd = EntryBlock->end(); + + for (; BBIt != BBEnd; ++BBIt) { + if (isa(BBIt) || + isVectorOrLinearParamStore(Clone, ParmKinds, &*BBIt)) { + // If this is a store of a vector parameter, keep it in the entry block + // because it will be modified with the vector alloca reference. Since the + // parameter has already been expanded, this becomes a vector store (i.e., + // packing instruction) that we do not want to appear in the scalar loop. + // It is correct to leave linear parameter stores in the entry or move + // them to the scalar loop, but leaving them in the entry block prevents + // an additional store inside the loop. Uniform parameter stores must be + // moved to the loop body to behave as uniform. Consider the following: + // + // __declspec(vector(uniform(x))) + // int foo(int a, int x) { + // x++; + // return (a + x); + // } + // + // Assume x = 1 for the call to foo. This implies x = 2 for the vector + // add. e.g., a[0:VL-1] + <2, 2, 2, 2>. If the initial store of x to the + // stack is done in the entry block outside of the loop, then x will be + // incremented by one each time within the loop because the increment of + // x will reside in the loop. Therefore, if the store of x is sunk into + // the loop, the initial value of 1 will always be stored to a temp + // before the increment, resulting in the value of 2 always being computed + // in the scalar loop. 
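A rough sketch of what splitEntryIntoLoop leaves behind for the foo(a, x) example above, assuming unoptimized IR in which parameters still have allocas (all names illustrative):

    entry:                              ; allocas and vector/linear parameter
      %a.addr = alloca i32              ; stores remain in the entry block
      %x.addr = alloca i32
      store i32 %a, i32* %a.addr
      br label %simd.loop

    simd.loop:                          ; the uniform store is sunk, so x++
      store i32 %x, i32* %x.addr        ; always starts from the incoming
      %0 = load i32, i32* %x.addr       ; value of %x on every iteration
      %inc = add nsw i32 %0, 1
      store i32 %inc, i32* %x.addr
      ...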
+ EntryInsts.push_back(&*BBIt); + } + } + + BasicBlock *LoopBlock = + EntryBlock->splitBasicBlock(EntryBlock->begin(), "simd.loop"); + + for (auto *Inst : EntryInsts) { + Inst->removeFromParent(); + Inst->insertBefore(EntryBlock->getTerminator()); + } + + DEBUG(dbgs() << "After Entry Block Split\n"); + DEBUG(Clone->dump()); + + return LoopBlock; +} + +BasicBlock *VecClone::splitLoopIntoReturn(Function *Clone, + BasicBlock *LoopBlock) { + + // Determine the basic block with the return. For simple cases, the 'ret' + // instruction will be part of the entry block. In this case, separate the + // 'ret' into a new basic block because we don't want this as part of the + // loop. For more complex cases, the 'ret' and corresponding instructions + // (i.e., load from auto variable) will already be in a separate basic block, + // so no need to split here. + + Instruction *SplitPt = LoopBlock->getTerminator(); + + if (ReturnInst *Return = dyn_cast(SplitPt)) { + + // If the return is from a preceeding load, make sure the load is also put + // in the return block. This is the old scalar load that will end up getting + // replaced with the vector return and will get cleaned up later. + + // Make sure this is not a void function before getting the return + // operand. + if (!Clone->getReturnType()->isVoidTy()) { + Value *RetOp = Return->getOperand(0); + Value::use_iterator UseIt = RetOp->use_begin(); + Value::use_iterator UseEnd = RetOp->use_end(); + + for (; UseIt != UseEnd; ++UseIt) { + LoadInst *RetLoad = dyn_cast(*UseIt); + if (RetLoad) { + SplitPt = RetLoad; + } + } + } + } + + Function::iterator ReturnBlockIt = Clone->end(); + BasicBlock *ReturnBlock; + if (dyn_cast(SplitPt) || dyn_cast(SplitPt)) { + ReturnBlock = LoopBlock->splitBasicBlock(SplitPt, "return"); + } else { + ReturnBlockIt = Clone->end(); + ReturnBlockIt--; + ReturnBlock = &*ReturnBlockIt; + } + + return ReturnBlock; +} + +void VecClone::updateReturnPredecessors(Function *Clone, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock) { + // Update the branches of the ReturnBlock predecessors to point back to + // LoopBlock if the index is less than VL. + + // First, collect the basic blocks to be updated since we don't want to update + // the CFG while iterating through it. + SmallVector BranchesToUpdate; + Function::iterator FI = Clone->begin(); + Function::iterator FE = Clone->end(); + for (; FI != FE; ++FI) { + + BasicBlock::iterator BBI = FI->begin(); + BasicBlock::iterator BBE = FI->end(); + + for (; BBI != BBE; ++BBI) { + + BranchInst *Branch = dyn_cast(BBI); + + if (Branch) { + unsigned NumSucc = Branch->getNumSuccessors(); + + for (unsigned I = 0; I < NumSucc; ++I) { + if (Branch->getSuccessor(I) == ReturnBlock) { + BranchesToUpdate.push_back(Branch); + } + } + } + } + } + + // Now, do the actual update. The code below handles both conditional and + // unconditional branches because we loop through all successors of the + // branch to see if any of them point to the ReturnBlock. 
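Taken together with splitEntryIntoLoop and splitLoopIntoReturn, the createLoopExit and createPhiAndBackedgeForLoop functions below give the clone a control-flow skeleton of roughly this shape (vector length 4 shown; the block and value names match the ones created by this pass):

    simd.loop:
      %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ]
      ...                               ; cloned scalar body
      br label %simd.loop.exit

    simd.loop.exit:
      %indvar = add nuw i32 %index, 1
      %vl.cond = icmp ult i32 %indvar, 4
      br i1 %vl.cond, label %simd.loop, label %return

    return:
      ...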
+ for (unsigned I = 0; I < BranchesToUpdate.size(); ++I) { + unsigned int NumOps = BranchesToUpdate[I]->getNumSuccessors(); + for (unsigned Idx = 0; Idx < NumOps; ++Idx) { + BasicBlock *Successor = BranchesToUpdate[I]->getSuccessor(Idx); + if (Successor == ReturnBlock) { + BranchesToUpdate[I]->setOperand(Idx, LoopExitBlock); + } + } + } +} + +BasicBlock *VecClone::createLoopExit(Function *Clone, BasicBlock *ReturnBlock) { + BasicBlock *LoopExitBlock = BasicBlock::Create( + Clone->getContext(), "simd.loop.exit", Clone, ReturnBlock); + + updateReturnPredecessors(Clone, LoopExitBlock, ReturnBlock); + return LoopExitBlock; +} + +PHINode *VecClone::createPhiAndBackedgeForLoop( + Function *Clone, BasicBlock *EntryBlock, BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, BasicBlock *ReturnBlock, int VectorLength) { + + // Create the phi node for the top of the loop block and add the back + // edge to the loop from the loop exit. + + PHINode *Phi = PHINode::Create(Type::getInt32Ty(Clone->getContext()), 2, + "index", &*LoopBlock->getFirstInsertionPt()); + + Constant *Inc = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 1); + Constant *IndInit = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 0); + + Instruction *Induction = + BinaryOperator::CreateNUWAdd(Phi, Inc, "indvar", LoopExitBlock); + + Constant *VL = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), VectorLength); + + Instruction *VLCmp = + new ICmpInst(*LoopExitBlock, CmpInst::ICMP_ULT, Induction, VL, "vl.cond"); + + BranchInst::Create(LoopBlock, ReturnBlock, VLCmp, LoopExitBlock); + + Phi->addIncoming(IndInit, EntryBlock); + Phi->addIncoming(Induction, LoopExitBlock); + + DEBUG(dbgs() << "After Loop Insertion\n"); + DEBUG(Clone->dump()); + + return Phi; +} + +Instruction * +VecClone::expandVectorParameters(Function *Clone, VectorVariant &V, + BasicBlock *EntryBlock, + std::vector &VectorParmMap) { + // For vector parameters, expand the existing alloca (if there is one) to a + // vector. If there isn't one, create a VL-sized alloca for it. Then, bitcast + // the vector and store this instruction in a map. The map is later used to + // insert the new instructions and to replace the old scalar memory + // references. If there are no parameters, then the function simply does not + // perform any expansion since we iterate over the function's arg list. We + // must always have a vector alloca for vector parameters so that we can + // bitcast them to a scalar pointer that can be loaded from using the loop + // index. + + Instruction *Mask = nullptr; + SmallVector StoresToInsert; + + Function::arg_iterator ArgIt = Clone->arg_begin(); + Function::arg_iterator ArgEnd = Clone->arg_end(); + + unsigned LastArg = Clone->arg_size() - 1; + unsigned ArgIdx = 0; + + for (; ArgIt != ArgEnd; ++ArgIt) { + + User::user_iterator UserIt = ArgIt->user_begin(); + User::user_iterator UserEnd = ArgIt->user_end(); + + VectorType *VecType = dyn_cast(ArgIt->getType()); + + if (VecType) { + + // Some args other than the mask may not have users, but have not been + // removed as dead. In those cases, just go on to the next argument. + // There's no need to expand non-mask parameters with no users. + bool MaskArg = V.isMasked() && ArgIdx == LastArg; + + if (!(!MaskArg && ArgIt->getNumUses() == 0)) { + + // Create a new vector alloca and bitcast to a pointer to the element + // type. 
The following is an example of what the cast should look like: + // + // %veccast = bitcast <2 x i32>* %vec_a.addr to i32* + // + // geps using the bitcast will appear in a scalar form instead of + // casting to an array or using vector. For example, + // + // %vecgep1 = getelementptr i32, i32* %veccast, i32 %index + // + // instead of: + // + // getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 1 + // + // We do this to put the geps in a more scalar form. + + const DataLayout &DL = Clone->getParent()->getDataLayout(); + AllocaInst *VecAlloca = new AllocaInst(VecType, DL.getAllocaAddrSpace(), + "vec." + ArgIt->getName()); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = PointerType::get( + VecType->getElementType(), VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecParmCast = nullptr; + if (MaskArg) { + Mask = new BitCastInst(VecAlloca, ElemTypePtr, "mask.cast"); + } else { + VecParmCast = new BitCastInst(VecAlloca, ElemTypePtr, + "vec." + ArgIt->getName() + ".cast"); + insertInstruction(VecParmCast, EntryBlock); + } + + StoreInst *StoreUser = nullptr; + AllocaInst *Alloca = nullptr; + ParmRef *PRef = new ParmRef(); + + for (; UserIt != UserEnd; ++UserIt) { + + StoreUser = dyn_cast(*UserIt); + + if (StoreUser) { + // For non-mask parameters, find the initial store of the parameter + // to an alloca instruction. Map this alloca to the vector bitcast + // created above so that we can update the old scalar references. + Alloca = dyn_cast(UserIt->getOperand(1)); + PRef->VectorParm = Alloca; + break; + } + } + + if (!Alloca && !Mask) { + // Since Mem2Reg has run, there is no existing scalar store for + // the parameter, but we must still pack (store) the expanded vector + // parameter to a new vector alloca. This store is created here and + // put in a container for later insertion. We cannot insert it here + // since this will be a new user of the parameter and we are still + // iterating over the original users of the parameter. This will + // invalidate the iterator. We also map the parameter directly to the + // vector bitcast so that we can later update any users of the + // parameter. + Value *ArgValue = dyn_cast(ArgIt); + StoreInst *Store = new StoreInst(ArgValue, VecAlloca); + StoresToInsert.push_back(Store); + PRef->VectorParm = ArgValue; + } + + if (!Mask) { + // Mapping not needed for the mask parameter because there will + // be no users of it to replace. This parameter will only be used to + // introduce if conditions on each mask bit. + PRef->VectorParmCast = VecParmCast; + VectorParmMap.push_back(PRef); + } + } + } + + ArgIdx++; + } + + // Insert any necessary vector parameter stores here. This is needed for when + // there were no existing scalar stores that we can update to vector stores + // for the parameter. This is needed when Mem2Reg has registerized parameters. + // The stores are inserted after the allocas in the entry block. + for (auto *Inst : StoresToInsert) { + insertInstruction(Inst, EntryBlock); + } + + return Mask; +} + +Instruction *VecClone::createExpandedReturn(Function *Clone, + BasicBlock *EntryBlock, + VectorType *ReturnType) { + // Expand the return temp to a vector. 
+ + VectorType *AllocaType = dyn_cast(Clone->getReturnType()); + + const DataLayout &DL = Clone->getParent()->getDataLayout(); + AllocaInst *VecAlloca = + new AllocaInst(AllocaType, DL.getAllocaAddrSpace(), "vec.retval"); + insertInstruction(VecAlloca, EntryBlock); + PointerType *ElemTypePtr = PointerType::get( + ReturnType->getElementType(), VecAlloca->getType()->getAddressSpace()); + + BitCastInst *VecCast = new BitCastInst(VecAlloca, ElemTypePtr, "ret.cast"); + insertInstruction(VecCast, EntryBlock); + + return VecCast; +} + +Instruction *VecClone::expandReturn(Function *Clone, BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *ReturnBlock, + std::vector &VectorParmMap) { + // Determine how the return is currently handled, since this will determine + // if a new vector alloca is required for it. For simple functions, an alloca + // may not have been created for the return value. The function may just + // simply return a value defined by some operation that now exists within the + // loop. If an alloca was generated already, then the return block will load + // from it and then return. Thus, we look for a return resulting from a load + // in the return block. If found, we have already expanded all alloca + // instructions to vector types and the old scalar references have already + // been replaced with them. In this case, we only need to pack the results + // from the vector alloca into a temp and return the temp. If a vector alloca + // was not generated for the return, we need to add one for it because we have + // a scalar reference in the loop that needs to be replaced. After creating + // the new vector alloca, replace the reference to it in the loop and then + // pack the results into a temp and return it. + // + // Example 1: // alloca not generated in entry block + // + // loop: + // ... // some set of instructions + // %add1 = add nsw i32 %1, %2 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // ret i32 %add1 + // + // + // Example 2: + // + // loop: + // ... // some set of instructions + // %vecgep1 = getelementptr <2 x i32>* %vec_ret, i32 0, i32 %index + // store i32 %add2, i32* %vecgep1 + // br label %loop.exit (loop exit contains br to return block) + // + // return: + // %7 = load i32, i32*, %retval // the original scalar alloca + // ret i32 %7 + // + + ReturnInst *FuncReturn = dyn_cast(ReturnBlock->getTerminator()); + assert(FuncReturn && "Expected ret instruction to terminate the return\ + basic block"); + + LoadInst *LoadFromAlloca = dyn_cast(FuncReturn->getOperand(0)); + + // We need to generate a vector alloca for the return vector. + // Two cases exist, here: + // + // 1) For simple functions, the return is a temp defined within the + // loop body and the temp is not loaded from an alloca, or the return is + // a constant. (obviously, also not loaded from an alloca) + // + // 2) The return temp traces back to an alloca. + // + // For both cases, generate a vector alloca so that we can later load from it + // and return the vector temp from the function. The alloca is used to load + // and store from so that the scalar loop contains load/store/gep + // instructions. This enables AVR construction to remain straightforward. + // E.g., we don't need to worry about figuring out how to represent + // insert/extract when building AVR nodes. This keeps consistent with how ICC + // is operating. 
+ // + // Additionally, for case 1 we must generate a gep and store after the + // instruction that defines the original return temp, so that we can store + // the result into the proper index of the return vector. For case 2, we must + // go into the loop and replace the old scalar alloca reference with the one + // just created as vector. + + Instruction *VecReturn = NULL; + VectorType *ReturnType = dyn_cast(Clone->getReturnType()); + + if (!LoadFromAlloca) { + + // Case 1 + + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + Value *RetVal = FuncReturn->getReturnValue(); + Instruction *RetFromTemp = dyn_cast(RetVal); + + Instruction *InsertPt; + Value *ValToStore; + Instruction *Phi = &*LoopBlock->begin(); + + if (RetFromTemp) { + // If we're returning from an SSA temp, set the insert point to the + // definition of the temp. + InsertPt = RetFromTemp; + ValToStore = RetFromTemp; + } else { + // If we're returning a constant, then set the insert point to the loop + // phi. From here, a store to the vector using the constant is inserted. + InsertPt = Phi; + ValToStore = RetVal; + } + + // Generate a gep from the bitcast of the vector alloca used for the return + // vector. + GetElementPtrInst *VecGep = + GetElementPtrInst::Create(ReturnType->getElementType(), VecReturn, Phi, + VecReturn->getName() + ".gep"); + VecGep->insertAfter(InsertPt); + + // Store the constant or temp to the appropriate lane in the return vector. + StoreInst *VecStore = new StoreInst(ValToStore, VecGep); + VecStore->insertAfter(VecGep); + + } else { + + // Case 2 + + AllocaInst *Alloca = dyn_cast(LoadFromAlloca->getOperand(0)); + bool AllocaFound = false; + unsigned ParmIdx = 0; + + for (; ParmIdx < VectorParmMap.size(); ParmIdx++) { + Value *ParmVal = VectorParmMap[ParmIdx]->VectorParm; + if (ParmVal == Alloca) + AllocaFound = true; + } + + if (AllocaFound) { + // There's already a vector alloca created for the return, which is the + // same one used for the parameter. E.g., we're returning the updated + // parameter. + VecReturn = VectorParmMap[ParmIdx]->VectorParmCast; + } else { + // A new return vector is needed because we do not load the return value + // from an alloca. + VecReturn = createExpandedReturn(Clone, EntryBlock, ReturnType); + ParmRef *PRef = new ParmRef(); + PRef->VectorParm = Alloca; + PRef->VectorParmCast = VecReturn; + VectorParmMap.push_back(PRef); + } + } + + return VecReturn; +} + +Instruction *VecClone::expandVectorParametersAndReturn( + Function *Clone, VectorVariant &V, Instruction **Mask, + BasicBlock *EntryBlock, BasicBlock *LoopBlock, BasicBlock *ReturnBlock, + std::vector &VectorParmMap) { + // If there are no parameters, then this function will do nothing and this + // is the expected behavior. + *Mask = expandVectorParameters(Clone, V, EntryBlock, VectorParmMap); + + // If the function returns void, then don't attempt to expand to vector. + Instruction *ExpandedReturn = ReturnBlock->getTerminator(); + if (!Clone->getReturnType()->isVoidTy()) { + ExpandedReturn = + expandReturn(Clone, EntryBlock, LoopBlock, ReturnBlock, VectorParmMap); + } + + // So, essentially what has been done to this point is the creation and + // insertion of the vector alloca instructions. Now, we insert the bitcasts of + // those instructions, which have been stored in the map. The insertion of the + // vector bitcast to element type pointer is done at the end of the EntryBlock + // to ensure that any initial stores of vector parameters have been done + // before the cast. 
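  // For reference, the masked two_vec_sum variant below ends up with its
  // entry block ordered this way (align attributes omitted):
  //
  //   %vec.i = alloca <4 x i32>                        ; allocas first
  //   %vec.j = alloca <4 x i32>
  //   %vec.mask = alloca <4 x i32>
  //   %vec.retval = alloca <4 x i32>
  //   store <4 x i32> %i, <4 x i32>* %vec.i            ; then parameter stores
  //   store <4 x i32> %j, <4 x i32>* %vec.j
  //   store <4 x i32> %mask, <4 x i32>* %vec.mask
  //   %vec.i.cast = bitcast <4 x i32>* %vec.i to i32*  ; casts last
  //   %vec.j.cast = bitcast <4 x i32>* %vec.j to i32*
  //   %ret.cast = bitcast <4 x i32>* %vec.retval to i32*
  //   %mask.cast = bitcast <4 x i32>* %vec.mask to i32*
  //   br label %simd.loop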
+ + std::vector::iterator MapIt; + for (auto MapIt : VectorParmMap) { + Instruction *ExpandedCast = MapIt->VectorParmCast; + if (!ExpandedCast->getParent()) { + insertInstruction(ExpandedCast, EntryBlock); + } + } + + // Insert the mask parameter store to alloca and bitcast if this is a masked + // variant. + if (*Mask) { + // Mask points to the bitcast of the alloca instruction to element type + // pointer. Insert the bitcast after all of the other bitcasts for vector + // parameters. + insertInstruction(*Mask, EntryBlock); + + Value *MaskVector = (*Mask)->getOperand(0); + + // MaskParm points to the function's mask parameter. + Function::arg_iterator MaskParm = Clone->arg_end(); + MaskParm--; + + // Find the last parameter store in the function entry block and insert the + // the store of the mask parameter after it. We do this just to make the + // LLVM IR easier to read. If there are no parameters, just insert the store + // before the terminator. For safety, if we cannot find a store, then insert + // this store after the last alloca. At this point, there will at least be + // an alloca for either a parameter or return. This code just ensures that + // the EntryBlock instructions are grouped by alloca, followed by store, + // followed by bitcast for readability reasons. + + StoreInst *MaskStore = new StoreInst(&*MaskParm, MaskVector); + insertInstruction(MaskStore, EntryBlock); + } + + DEBUG(dbgs() << "After Parameter/Return Expansion\n"); + DEBUG(Clone->dump()); + + return ExpandedReturn; +} + +bool VecClone::typesAreCompatibleForLoad(Type *GepType, Type *LoadType) { + // GepType will always be a pointer since this refers to an alloca for a + // vector. + PointerType *GepPtrTy = dyn_cast(GepType); + Type *LoadFromTy = GepPtrTy->getElementType(); + Type *LoadToTy = LoadType; + + // Dereferencing pointers in LLVM IR means that we have to have a load for + // each level of indirection. This means that we load from a gep and the + // resulting load value type is reduced by one level of indirection. For + // example, we load from a gep of i32* to a temp that has an i32 type. We + // cannot do multiple levels of dereferencing in a single load. For example, + // we cannot load from a gep of i32** to an i32. This requires two loads. + // + // Legal Case: GepType = i32**, LoadFromTy = i32*, + // LoadType = i32*, LoadToTy = i32* + // + // %vec.b.elem.2 = load i32*, i32** %vec.b.cast.gep1 + // + // In this case, since both are pointers, types will be considered equal by + // LLVM, so we must continue getting the element types of each pointer type + // until one is no longer a pointer type. Then do an equality check. + // + // Legal Case: GepType = i32*, LoadFromTy = i32, + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32* %vec.b.cast.gep1 + // + // Ready to compare as is + // + // Illegal Case: GepType = i32**, LoadFromTy = i32* + // LoadType = i32, LoadToTy = i32 + // + // %vec.b.elem.2 = load i32, i32** %vec.b.cast.gep1 + // + // This case arises due to differences in the LLVM IR at -O0 and >= -O1. + // For >= -O1, Mem2Reg registerizes parameters and there are no alloca + // instructions created for function parameters. At -O0, vector parameters + // are expanded and we modify the existing alloca that was used for the scalar + // parameter. When there is no alloca for vector parameters, we must create + // one for them. Thus, we have introduced an additional level of indirection + // for users of parameters at >= -O1. 
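  // An equivalent way to view the check below: a single load strips exactly
  // one level of indirection, so the load is legal here only when the gep's
  // pointee and the load's result type have the same pointer depth and bottom
  // out at the same kind of type.  A minimal sketch of that formulation (the
  // lambda is purely illustrative and not used by this patch):
  //
  //   auto PointerDepthOf = [](Type *Ty) {
  //     unsigned Depth = 0;
  //     while (auto *PT = dyn_cast<PointerType>(Ty)) {
  //       Ty = PT->getElementType();
  //       ++Depth;
  //     }
  //     return Depth;
  //   };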
This can become a problem for load + // instructions and results in this illegal case. This function helps to + // check that we are not attempting to do an extra level of indirection + // within the load instructions for elements of vector parameters in the + // simd loop. If an illegal case is encountered, an additional load is + // inserted to account for the extra level of indirection and any users are + // updated accordingly. + + while (LoadFromTy->getTypeID() == Type::PointerTyID && + LoadToTy->getTypeID() == Type::PointerTyID) { + + PointerType *FromPtrTy = cast(LoadFromTy); + PointerType *ToPtrTy = cast(LoadToTy); + + LoadFromTy = FromPtrTy->getElementType(); + LoadToTy = ToPtrTy->getElementType(); + } + + if (LoadFromTy->getTypeID() == LoadToTy->getTypeID()) { + return true; + } + + return false; +} + +void VecClone::updateScalarMemRefsWithVector( + Function *Clone, Function &F, BasicBlock *EntryBlock, + BasicBlock *ReturnBlock, PHINode *Phi, + std::vector &VectorParmMap) { + // This function replaces the old scalar uses of a parameter with a reference + // to the new vector one. A gep is inserted using the vector bitcast created + // in the entry block and any uses of the parameter are replaced with this + // gep. The only users that will not be updated are those in the entry block + // that do the initial store to the vector alloca of the parameter. + + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + + SmallVector InstsToUpdate; + Value *Parm = VectorParmMapIt->VectorParm; + Instruction *Cast = VectorParmMapIt->VectorParmCast; + + for (User *U : Parm->users()) { + InstsToUpdate.push_back(dyn_cast(U)); + } + + for (unsigned I = 0; I < InstsToUpdate.size(); ++I) { + + Instruction *User = InstsToUpdate[I]; + if (!(dyn_cast(User) && User->getParent() == EntryBlock)) { + + BitCastInst *BitCast = dyn_cast(Cast); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + GetElementPtrInst *VecGep = GetElementPtrInst::Create( + PointeeType, BitCast, Phi, BitCast->getName() + ".gep", User); + + unsigned NumOps = User->getNumOperands(); + for (unsigned I = 0; I < NumOps; ++I) { + if (User->getOperand(I) == Parm) { + + bool TypesAreCompatible = false; + + if (isa(User)) { + TypesAreCompatible = + typesAreCompatibleForLoad(VecGep->getType(), User->getType()); + } + + if ((isa(User) && TypesAreCompatible) || + isa(User)) { + // If the user is a load/store and the dereferencing is legal, + // then just modify the load/store operand to use the gep. + User->setOperand(I, VecGep); + } else { + // Otherwise, we need to load the value from the gep first before + // using it. This effectively loads the particular element from + // the vector parameter. + LoadInst *ParmElemLoad = + new LoadInst(VecGep, "vec." + Parm->getName() + ".elem"); + ParmElemLoad->insertAfter(VecGep); + User->setOperand(I, ParmElemLoad); + } + } + } + } else { + // The user is the parameter store to alloca in the entry block. Replace + // the old scalar alloca with the new vector one. + AllocaInst *VecAlloca = dyn_cast(Cast->getOperand(0)); + User->setOperand(1, VecAlloca); + } + } + } + + DEBUG(dbgs() << "After Alloca Replacement\n"); + DEBUG(Clone->dump()); +} + +Instruction *VecClone::generateStrideForParameter(Function *Clone, + Argument *Arg, + Instruction *ParmUser, + int Stride, PHINode *Phi) { + // For linear values, a mul/add sequence is needed to generate the correct + // value. 
i.e., val = linear_var * stride + loop_index; + // + // StrideInst is returned as the last instruction needed to update the users + // of the old parameter reference. + Instruction *StrideInst = nullptr; + + // The phi for the loop index is generated by this pass as i32, which is + // why the mul instruction is i32. + Constant *StrideConst = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), Stride); + + Instruction *Mul = BinaryOperator::CreateMul(StrideConst, Phi, "stride.mul"); + + // Insert the stride related instructions after the user if the instruction + // involves a redefinition of the parameter. For example, a load from the + // parameter's associated alloca or a cast. For these situations, we want to + // apply the stride to this Value. For other instructions, e.g., add, the + // instruction computing the stride must be inserted before the usage of it. + + if (!isa(ParmUser)) { + Mul->insertBefore(ParmUser); + } else { + Mul->insertAfter(ParmUser); + } + + if (Arg->getType()->isPointerTy()) { + + // Linear updates to pointer parameters involves an address calculation, so + // use gep. To properly update linear pointers we only need to multiply the + // loop index and stride since gep is indexed starting at 0 from the base + // address passed to the vector function. + PointerType *ParmPtrType = dyn_cast(Arg->getType()); + + // The base address used for linear gep computations. + Value *BaseAddr = nullptr; + StringRef RefName; + + if (LoadInst *ParmLoad = dyn_cast(ParmUser)) { + // We are loading from the alloca of the pointer parameter (no Mem2Reg) + // i.e., loading a pointer to another Value. + BaseAddr = ParmUser; + RefName = ParmLoad->getOperand(0)->getName(); + } else { + // The user is using the pointer parameter directly. + BaseAddr = Arg; + RefName = BaseAddr->getName(); + } + + // Mul is always generated as i32 since it is calculated using the i32 loop + // phi that is inserted by this pass. No cast on Mul is necessary because + // gep can use a base address of one type with an index of another type. + GetElementPtrInst *LinearParmGep = GetElementPtrInst::Create( + ParmPtrType->getElementType(), BaseAddr, Mul, RefName + ".gep"); + + LinearParmGep->insertAfter(Mul); + StrideInst = LinearParmGep; + } else { + // Note: the phi for the loop index is generated by this pass as i32. + // Also, Mul above is generated as i32 because the phi type is always i32. + // However, ParmUser may be another type, so + // + // Generate the instruction that computes the stride. + // + // Example of applying stride: + // + // define float @dowork(float* nocapture readonly %a, float %b, i64 %k) { + // entry: + // %arrayidx = getelementptr inbounds float, float* %a, i64 %k + // %0 = load float, float* %arrayidx, align 4, !tbaa !2 + // %call = tail call float @sinf(float %0) #4 + // %add = fadd float %call, %b + // %conv = sitofp i64 %k to float + // %add1 = fadd float %add, %conv + // ret float %add1 + // } + // + // Case 1: stride for %k must be applied to %conv after %conv so that + // %add1 gets the updated stride value. + // + // Case 2: stride for %k in %arrayidx must be applied before the gep, + // so that the correct index is used. This is the same for other + // non-unary type instructions. 
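  // For reference, the closed form of the value materialized for lane %index:
  //
  //   non-pointer linear parm:  parm (or its converted value, for case 1)
  //                             + Stride * %index
  //   pointer linear parm:      gep base, Stride * %index   ; in element units
  //
  // where the i32 multiply is first sign-extended or bitcast whenever the
  // addend's type is not i32 (the stride.cast handling below).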
+ // + BinaryOperator *Add; + Value *StrideVal; + Type *StrideTy; + StringRef TempName = "stride.add"; + if (isa(ParmUser)) { + // Case 1 + StrideVal = ParmUser; + StrideTy = ParmUser->getType(); + } else { + // Case 2 + StrideVal = Arg; + StrideTy = Arg->getType(); + } + + // Stride calculations may need typecasting since the index multiply + // is i32, but the users of the linear value may not be i32. + // + // Example (stride applied to %conv): + // + // %conv = sitofp i64 %k to float + // %stride.mul = mul i32 1, %index + // %stride.cast = bitcast i32 %stride.mul to float + // %stride.add = fadd float %conv, %stride.cast + // %add1 = fadd float %add, %stride.add + // + if (StrideTy != Mul->getType()) { + Instruction *MulConv = + CastInst::CreateSExtOrBitCast(Mul, StrideTy, "stride.cast"); + MulConv->insertAfter(Mul); + Mul = MulConv; + } + + if (StrideTy->isFloatingPointTy()) { + Add = BinaryOperator::CreateFAdd(StrideVal, Mul, TempName); + } else if (StrideTy->isIntegerTy()) { + Add = BinaryOperator::CreateAdd(StrideVal, Mul, TempName); + } else { + llvm_unreachable("Expected integer or floating point type"); + } + + Add->insertAfter(Mul); + StrideInst = Add; + } + + return StrideInst; +} + +void VecClone::updateLinearReferences(Function *Clone, Function &F, + VectorVariant &V, PHINode *Phi) { + // Add stride to parameters marked as linear. This is done by finding all + // users of the scalar alloca associated with the parameter. The user should + // be a load from this alloca to a temp. The stride is then added to this temp + // and its uses are replaced with the new temp. Or, if Mem2Reg eliminates the + // alloca/load, the parameter is used directly and this use is updated with + // the stride. + + Function::arg_iterator ArgListIt = Clone->arg_begin(); + Function::arg_iterator ArgListEnd = Clone->arg_end(); + std::vector ParmKinds = V.getParameters(); + + for (; ArgListIt != ArgListEnd; ++ArgListIt) { + + User::user_iterator ArgUserIt = ArgListIt->user_begin(); + User::user_iterator ArgUserEnd = ArgListIt->user_end(); + unsigned ParmIdx = ArgListIt->getArgNo(); + SmallVector LinearParmUsers; + + if (ParmKinds[ParmIdx].isLinear()) { + + int Stride = ParmKinds[ParmIdx].getStride(); + + for (; ArgUserIt != ArgUserEnd; ++ArgUserIt) { + + // Collect all uses of the parameter so that they can later be used to + // apply the stride. + Instruction *ParmUser = dyn_cast(*ArgUserIt); + if (StoreInst *ParmStore = dyn_cast(ParmUser)) { + + // This code traces the store of the parameter to its associated + // alloca. Then, we look for a load from that alloca to a temp. This + // is the value we need to add the stride to. This is for when + // Mem2Reg has not been run. + AllocaInst *Alloca = dyn_cast(ArgUserIt->getOperand(1)); + + if (Alloca) { + for (auto *AU : Alloca->users()) { + + LoadInst *ParmLoad = dyn_cast(AU); + + if (ParmLoad) { + // The parameter is being loaded from an alloca to a new SSA + // temp. We must replace the users of this load with an + // instruction that adds the result of this load with the + // stride. + LinearParmUsers.push_back(ParmLoad); + } + } + } else { + // Mem2Reg has run, so the parameter is directly referenced in the + // store instruction. + LinearParmUsers.push_back(ParmStore); + } + } else { + // Mem2Reg has registerized the parameters, so users of it will use + // it directly, and not through a load of the parameter. 
+ LinearParmUsers.push_back(ParmUser); + } + } + + for (unsigned I = 0; I < LinearParmUsers.size(); I++) { + // For each user of parameter: + // + // We must deal with two cases here, based on whether Mem2Reg has been + // run. + // + // Example: + // + // __declspec(vector(linear(i:1),uniform(x),vectorlength(4))) + // extern int foo(int i, int x) { + // return (x + i); + // } + // + // 1) We are loading the parameter from an alloca and the SSA temp as + // as a result of the load is what we need to add the stride to. + // Then, any users of that temp must be replaced. The only load + // instructions put in the collection above are guaranteed to be + // associated with the parameter's alloca. Thus, we only need to + // check to see if a load is in the map to know what to do. + // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 <--- %i + // %add = add nsw i32 %0, %1 <--- replace %1 with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // store i32 %x, i32* %x.addr, align 4 + // %0 = load i32, i32* %x.addr, align 4 + // %1 = load i32, i32* %i.addr, align 4 + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %1, %stride.mul <--- stride + // %add = add nsw i32 %0, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // 2) The user uses the parameter directly, and so we must apply the + // stride directly to the parameter. Any users of the parameter + // must then be updated. + // + // Before Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %add = add nsw i32 %x, %i <-- direct usage of %i + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + // + // After Linear Update: + // + // simd.loop: ; preds = %simd.loop.exit, %entry + // %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] + // %stride.mul = mul i32 1, %index + // %stride.add = add i32 %i, %stride.mul <--- stride + // %add = add nsw i32 %x, %stride.add <--- new %i with stride + // %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index + // store i32 %add, i32* %ret.cast.gep + // br label %simd.loop.exit + + Instruction *StrideInst = generateStrideForParameter( + Clone, &*ArgListIt, LinearParmUsers[I], Stride, Phi); + + SmallVector InstsToUpdate; + Value *ParmUser; + + if (isa(LinearParmUsers[I])) { + // Case 1 + ParmUser = LinearParmUsers[I]; + User::user_iterator StrideUserIt = LinearParmUsers[I]->user_begin(); + User::user_iterator StrideUserEnd = LinearParmUsers[I]->user_end(); + + // Find the users of the redefinition of the parameter so that we + // can apply the stride to those instructions. 
+ for (; StrideUserIt != StrideUserEnd; ++StrideUserIt) { + Instruction *StrideUser = dyn_cast(*StrideUserIt); + if (StrideUser != StrideInst) { + // We've already inserted the stride which is now also a user of + // the parameter, so don't update that instruction. Otherwise, + // we'll create a self reference. Hence, why we don't use + // replaceAllUsesWith(). + InstsToUpdate.push_back(StrideUser); + } + } + } else { + // Case 2 + ParmUser = &*ArgListIt; + InstsToUpdate.push_back(LinearParmUsers[I]); + } + + // Replace the old references to the parameter with the instruction + // that applies the stride. + for (unsigned J = 0; J < InstsToUpdate.size(); ++J) { + unsigned NumOps = InstsToUpdate[J]->getNumOperands(); + for (unsigned K = 0; K < NumOps; ++K) { + if (InstsToUpdate[J]->getOperand(K) == ParmUser) { + InstsToUpdate[J]->setOperand(K, StrideInst); + } + } + } + } + } + } + + DEBUG(dbgs() << "After Linear Updates\n"); + DEBUG(Clone->dump()); +} + +void VecClone::updateReturnBlockInstructions(Function *Clone, + BasicBlock *ReturnBlock, + Instruction *ExpandedReturn) { + // If the vector function returns void, then there is no need to do any + // packing. The only instruction in the ReturnBlock is 'ret void', so + // we can just leave this instruction and we're done. + if (Clone->getReturnType()->isVoidTy()) + return; + + // Collect all instructions in the return basic block. They will be removed. + SmallVector InstToRemove; + BasicBlock::iterator InstIt = ReturnBlock->begin(); + BasicBlock::iterator InstEnd = ReturnBlock->end(); + + for (; InstIt != InstEnd; ++InstIt) { + InstToRemove.push_back(&*InstIt); + } + + // Remove all instructions from the return block. These will be replaced + // with the instructions necessary to return a vector temp. The verifier + // will complain if we remove the definitions of users first, so remove + // instructions from the bottom up. + for (int I = InstToRemove.size() - 1; I >= 0; I--) { + InstToRemove[I]->eraseFromParent(); + } + + // Pack up the elements into a vector temp and return it. If the return + // vector was bitcast to a pointer to the element type, we must bitcast to + // vector before returning. + Instruction *Return; + if (dyn_cast(ExpandedReturn)) { + // Operand 0 is the actual alloc reference in the bitcast. + AllocaInst *Alloca = dyn_cast(ExpandedReturn->getOperand(0)); + PointerType *PtrVecType = PointerType::get( + Clone->getReturnType(), Alloca->getType()->getAddressSpace()); + BitCastInst *BitCast = + new BitCastInst(ExpandedReturn, PtrVecType, + "vec." + ExpandedReturn->getName(), ReturnBlock); + Return = BitCast; + } else { + Return = ExpandedReturn; + } + + LoadInst *VecReturn = new LoadInst(Return, "vec.ret", ReturnBlock); + ReturnInst::Create(Clone->getContext(), VecReturn, ReturnBlock); + + DEBUG(dbgs() << "After Return Block Update\n"); + DEBUG(Clone->dump()); +} + +int VecClone::getParmIndexInFunction(Function *F, Value *Parm) { + Function::arg_iterator ArgIt = F->arg_begin(); + Function::arg_iterator ArgEnd = F->arg_end(); + for (unsigned Idx = 0; ArgIt != ArgEnd; ++ArgIt, ++Idx) { + if (Parm == &*ArgIt) + return Idx; + } + + return -1; +} + +bool VecClone::isSimpleFunction(Function *Clone, VectorVariant &V, + ReturnInst *ReturnOnly) { + // For really simple functions, there is no need to go through the process + // of inserting a loop. + + // Example: + // + // void foo(void) { + // return; + // } + // + // No need to insert a loop for this case since it's basically a no-op. Just + // clone the function and return. 
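  // For reference (this refers back to updateReturnBlockInstructions above):
  // after the old scalar instructions are removed, the rebuilt return block
  // for a <4 x i32> variant looks like
  //
  //   return:
  //     %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>*
  //     %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast
  //     ret <4 x i32> %vec.ret
  //
  // as checked by the two_vec_sum tests below.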
It's possible that we could have some code + // inside of a vector function that modifies global memory. Let that case go + // through. + if (ReturnOnly && Clone->getReturnType()->isVoidTy()) { + return true; + } + + return false; +} + +void VecClone::insertSplitForMaskedVariant(Function *Clone, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi) { + BasicBlock *LoopThenBlock = + LoopBlock->splitBasicBlock(LoopBlock->getFirstNonPHI(), "simd.loop.then"); + + BasicBlock *LoopElseBlock = BasicBlock::Create( + Clone->getContext(), "simd.loop.else", Clone, LoopExitBlock); + + BranchInst::Create(LoopExitBlock, LoopElseBlock); + + BitCastInst *BitCast = dyn_cast(Mask); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + GetElementPtrInst *MaskGep = GetElementPtrInst::Create( + PointeeType, Mask, Phi, "mask.gep", LoopBlock->getTerminator()); + + LoadInst *MaskLoad = + new LoadInst(MaskGep, "mask.parm", LoopBlock->getTerminator()); + + Type *CompareTy = MaskLoad->getType(); + Instruction *MaskCmp; + Constant *Zero; + + // Generate the compare instruction to see if the mask bit is on. In ICC, we + // use the movemask intrinsic which takes both float/int mask registers and + // converts to an integer scalar value, one bit representing each element. + // AVR construction will be complicated if this intrinsic is introduced here, + // so the current solution is to just generate either an integer or floating + // point compare instruction for now. This may change anyway if we decide to + // go to a vector of i1 values for the mask. I suppose this would be one + // positive reason to use vector of i1. + if (CompareTy->isIntegerTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0); + MaskCmp = new ICmpInst(LoopBlock->getTerminator(), CmpInst::ICMP_NE, + MaskLoad, Zero, "mask.cond"); + } else if (CompareTy->isFloatingPointTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0.0); + MaskCmp = new FCmpInst(LoopBlock->getTerminator(), CmpInst::FCMP_UNE, + MaskLoad, Zero, "mask.cond"); + } else { + assert(0 && "Unsupported mask compare"); + } + + TerminatorInst *Term = LoopBlock->getTerminator(); + Term->eraseFromParent(); + BranchInst::Create(LoopThenBlock, LoopElseBlock, MaskCmp, LoopBlock); + + DEBUG(dbgs() << "After Split Insertion For Masked Variant\n"); + DEBUG(Clone->dump()); +} + +void VecClone::removeScalarAllocasForVectorParams( + std::vector &VectorParmMap) { + std::vector::iterator VectorParmMapIt; + + for (auto VectorParmMapIt : VectorParmMap) { + Value *Parm = VectorParmMapIt->VectorParm; + if (AllocaInst *ScalarAlloca = dyn_cast(Parm)) { + ScalarAlloca->eraseFromParent(); + } + } +} + +void VecClone::disableLoopUnrolling(BasicBlock *Latch) { + // Set disable unroll metadata on the conditional branch of the loop latch + // for the simd loop. The following is an example of what the loop latch + // and Metadata will look like. The !llvm.loop marks the beginning of the + // loop Metadata and is always placed on the terminator of the loop latch. + // (i.e., simd.loop.exit in this case). According to LLVM documentation, to + // properly set the loop Metadata, the 1st operand of !16 must be a self- + // reference to avoid some type of Metadata merging conflicts that have + // apparently arisen in the past. This is part of LLVM history that I do not + // know. Also, according to LLVM documentation, any Metadata nodes referring + // to themselves are marked as distinct. 
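  // For reference (this refers back to insertSplitForMaskedVariant above):
  // the masked two_vec_sum variant ends up with a per-lane guard of the form
  //
  //   simd.loop:
  //     %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ]
  //     %mask.gep = getelementptr i32, i32* %mask.cast, i32 %index
  //     %mask.parm = load i32, i32* %mask.gep
  //     %mask.cond = icmp ne i32 %mask.parm, 0
  //     br i1 %mask.cond, label %simd.loop.then, label %simd.loop.else
  //
  //   simd.loop.else:                                  ; lane masked off
  //     br label %simd.loop.exit
  //
  // with the original body now living in %simd.loop.then.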
As such, all Metadata corresponding + // to a loop belongs to that loop alone and no sharing of Metadata can be + // done across different loops. + // + // simd.loop.exit: ; preds = %simd.loop, %if.else, %if.then + // %indvar = add nuw i32 %index, 1 + // %vl.cond = icmp ult i32 %indvar, 2 + // br i1 %vl.cond, label %simd.loop, label %simd.end.region, !llvm.loop !16 + // + // !16 = distinct !{!16, !17} + // !17 = !{!"llvm.loop.unroll.disable"} + + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + + // Add unroll(disable) metadata to disable future unrolling. + LLVMContext &Context = Latch->getContext(); + SmallVector DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + Latch->getTerminator()->setMetadata("llvm.loop", NewLoopID); +} + +bool VecClone::runOnModule(Module &M) { + + DEBUG(dbgs() << "\nExecuting SIMD Function Cloning ...\n\n"); + + std::map> FunctionsToVectorize; + getFunctionsToVectorize(M, FunctionsToVectorize); + + // VectorParmMap contains the mapping of the parameter to the bitcast + // instruction that casts the vector alloca for vector parameters to a scalar + // pointer for use in the simd loop. When parameters are registerized, the + // Value* in the map correponds directly to the function parameter. When + // parameters are not registerized, then the Value* in the map is the original + // scalar alloca before expansion. Later, users of the parameter, either + // directly or through the alloca, are replaced with a gep using the bitcast + // of the vector alloca for the parameter and the current loop induction + // variable value. + // + // IMPORTANT NOTE: std::vector was used here because later we emit LLVM + // instructions using the members of ParmRef, and these instructions should be + // ordered consistently for easier testability. + + std::vector VectorParmMap; + + std::map>::iterator VarIt; + std::map>::iterator VarEnd; + for (VarIt = FunctionsToVectorize.begin(), + VarEnd = FunctionsToVectorize.end(); + VarIt != VarEnd; ++VarIt) { + + Function &F = *(VarIt->first); + std::vector Variants = VarIt->second; + TargetTransformInfo *TTI = + &getAnalysis().getTTI(F); + + for (unsigned i = 0; i < Variants.size(); i++) { + + VectorVariant Variant(Variants[i], TTI); + + // Clone the original function. + DEBUG(dbgs() << "Before SIMD Function Cloning\n"); + DEBUG(F.dump()); + Function *Clone = CloneFunction(F, Variant); + Function::iterator EntryBlock = Clone->begin(); + BasicBlock::iterator FirstInst = EntryBlock->begin(); + ReturnInst *ReturnOnly = dyn_cast(FirstInst); + + if (isSimpleFunction(Clone, Variant, ReturnOnly)) { + continue; + } + + BasicBlock *LoopBlock = splitEntryIntoLoop(Clone, Variant, &*EntryBlock); + BasicBlock *ReturnBlock = splitLoopIntoReturn(Clone, &Clone->back()); + BasicBlock *LoopExitBlock = createLoopExit(Clone, ReturnBlock); + PHINode *Phi = createPhiAndBackedgeForLoop(Clone, &*EntryBlock, LoopBlock, + LoopExitBlock, ReturnBlock, + Variant.getVlen()); + + // At this point, we've gathered some parameter information and have + // restructured the function into an entry block, a set of blocks + // forming the loop, a loop exit block, and a return block. 
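      // For reference, with vlen = 4 the skeleton built so far has the shape
      // the two_vec_sum test checks:
      //
      //   simd.loop:
      //     %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ]
      //     ...                              ; body, still scalar at this point
      //     br label %simd.loop.exit
      //
      //   simd.loop.exit:
      //     %indvar = add nuw i32 %index, 1
      //     %vl.cond = icmp ult i32 %indvar, 4
      //     br i1 %vl.cond, label %simd.loop, label %return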
Now, + // we can go through and update instructions since we know what + // is part of the loop. + + // Create a new vector alloca instruction for all vector parameters and + // return. For parameters, replace the initial store to the old alloca + // with the vector one. Users of the old alloca within the loop will be + // replaced with a gep using this address along with the proper loop + // index. + + Instruction *Mask = NULL; + Instruction *ExpandedReturn = expandVectorParametersAndReturn( + Clone, Variant, &Mask, &*EntryBlock, LoopBlock, ReturnBlock, + VectorParmMap); + updateScalarMemRefsWithVector(Clone, F, &*EntryBlock, ReturnBlock, Phi, + VectorParmMap); + + // Update any linear variables with the appropriate stride. This function + // will insert a mul/add sequence before the use of the parameter. For + // linear pointer parameters, the stride calculation is just a mul + // instruction using the loop induction var and the stride value on the + // parameter. This mul instruction is then used as the index of the gep + // that will be inserted before the next use of the parameter. The + // function also updates the users of the parameter with the new + // calculation involving the stride. + updateLinearReferences(Clone, F, Variant, Phi); + + // Remove the old scalar instructions associated with the return and + // replace with packing instructions. + updateReturnBlockInstructions(Clone, ReturnBlock, ExpandedReturn); + + // Remove the old scalar allocas associated with vector parameters since + // these have now been replaced with vector ones. + removeScalarAllocasForVectorParams(VectorParmMap); + + for (auto *Parm : VectorParmMap) { + delete Parm; + } + VectorParmMap.clear(); + + // If this is the masked vector variant, insert the mask condition and + // if/else blocks. + if (Variant.isMasked()) { + insertSplitForMaskedVariant(Clone, LoopBlock, LoopExitBlock, Mask, Phi); + } + + DEBUG(dbgs() << "After SIMD Function Cloning\n"); + DEBUG(Clone->dump()); + + // Disable unrolling from kicking in on the simd loop. + disableLoopUnrolling(LoopExitBlock); + + } // End of function cloning for the variant + } // End of function cloning for all variants + + return true; // LLVM IR has been modified +} + +void VecClone::print(raw_ostream &OS, const Module *M) const { + // TODO +} + +ModulePass *llvm::createVecClonePass() { return new llvm::VecClone(); } + +char VecClone::ID = 0; + +static const char lv_name[] = "VecClone"; +INITIALIZE_PASS_BEGIN(VecClone, SV_NAME, lv_name, false /* modifies CFG */, + false /* transform pass */) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(VecClone, SV_NAME, lv_name, false /* modififies CFG */, + false /* transform pass */) Index: test/Transforms/VecClone/all_parm_types.ll =================================================================== --- test/Transforms/VecClone/all_parm_types.ll +++ test/Transforms/VecClone/all_parm_types.ll @@ -0,0 +1,46 @@ +; Test all different kinds of parameters (uniform, linear, vector), multiple uses of linear k, and that stride calculations can handle type conversions. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4uvl_dowork +; CHECK: simd.loop: +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %stride.cast{{.*}} = sext i32 %stride.mul{{.*}} +; CHECK: %stride.add{{.*}} = add i64 %k, %stride.cast{{.*}} +; CHECK: %arrayidx = getelementptr inbounds float, float* %a, i64 %stride.add{{.*}} +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %stride.cast{{.*}} = bitcast i32 %stride.mul{{.*}} to float +; CHECK: %stride.add{{.*}} = fadd float %conv, %stride.cast{{.*}} +; CHECK: %add{{.*}} = fadd float %add, %stride.add{{.*}} + +; ModuleID = 'rfc.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define float @dowork(float* %a, float %b, i64 %k) #0 { +entry: + %arrayidx = getelementptr inbounds float, float* %a, i64 %k + %0 = load float, float* %arrayidx, align 4, !tbaa !2 + %call = call float @sinf(float %0) #5 + %add = fadd float %call, %b + %conv = sitofp i64 %k to float + %add1 = fadd float %add, %conv + ret float %add1 +} + +; Function Attrs: nounwind +declare float @sinf(float) #1 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4uvl_dowork,_ZGVcN8uvl_dowork,_ZGVdN8uvl_dowork,_ZGVeN16uvl_dowork,_ZGVbM4uvl_dowork,_ZGVcM8uvl_dowork,_ZGVdM8uvl_dowork,_ZGVeM16uvl_dowork" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0 (trunk 316400)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"float", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} Index: test/Transforms/VecClone/broadcast.ll =================================================================== --- test/Transforms/VecClone/broadcast.ll +++ test/Transforms/VecClone/broadcast.ll @@ -0,0 +1,19 @@ +; Check broadcast of a constant. The store of the constant should be moved inside of the loop. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4_foo +; CHECK: simd.loop: +; CHECK: store i32 99, i32* %ret.cast.gep + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo() #0 { +entry: + ret i32 99 +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4_foo,_ZGVbN4_foo,_ZGVcM8_foo,_ZGVcN8_foo,_ZGVdM8_foo,_ZGVdN8_foo,_ZGVeM16_foo,_ZGVeN16_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/convert_linear.ll =================================================================== --- test/Transforms/VecClone/convert_linear.ll +++ test/Transforms/VecClone/convert_linear.ll @@ -0,0 +1,32 @@ +; Check handling of upconverting a linear (variable %i) to ensure stride calculation +; is inserted correctly and the old convert (sext) uses the stride instead of the old +; reference to %i. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN2vl_foo +; CHECK: simd.loop: +; CHECK: %0 = load i32, i32* %i.addr +; CHECK-NEXT: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %0, %stride.mul +; CHECK-NEXT: %conv = sext i32 %stride.add to i64 + +; ModuleID = 'convert.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i64 @foo(i64 %x, i32 %i) #0 { +entry: + %x.addr = alloca i64, align 8 + %i.addr = alloca i32, align 4 + store i64 %x, i64* %x.addr, align 8 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %x.addr, align 8 + %add = add nsw i64 %conv, %1 + ret i64 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM2vl_foo,_ZGVbN2vl_foo,_ZGVcM4vl_foo,_ZGVcN4vl_foo,_ZGVdM4vl_foo,_ZGVdN4vl_foo,_ZGVeM8vl_foo,_ZGVeN8vl_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/external_array.ll =================================================================== --- test/Transforms/VecClone/external_array.ll +++ test/Transforms/VecClone/external_array.ll @@ -0,0 +1,35 @@ +; Check to see that we are applying the correct updated linear index for an external array access gep. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4ul_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %idxprom = sext i32 %stride.add to i64 +; CHECK: %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom +; CHECK: store i32 %0, i32* %arrayidx + +; ModuleID = 'external_array_assign.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@ext_a = common global [128 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @foo(i32 %x, i32 %i) #0 { +entry: + %x.addr = alloca i32, align 4 + %i.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom + store i32 %0, i32* %arrayidx, align 4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable "vector-variants"="_ZGVbM4ul_foo,_ZGVbN4ul_foo,_ZGVcM8ul_foo,_ZGVcN8ul_foo,_ZGVdM8ul_foo,_ZGVdN8ul_foo,_ZGVeM16ul_foo,_ZGVeN16ul_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear.ll =================================================================== --- test/Transforms/VecClone/linear.ll +++ test/Transforms/VecClone/linear.ll @@ -0,0 +1,29 @@ +; Check to see that the linear parameter i is updated with the correct stride, indicated by a mul/add instruction sequence after the load. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %add = add nsw i32 %0, %stride.add + +; ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %i.addr = alloca i32, align 4 + %x.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %x, i32* %x.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4lu_foo,_ZGVbN4lu_foo,_ZGVcM8lu_foo,_ZGVcN8lu_foo,_ZGVdM8lu_foo,_ZGVdN8lu_foo,_ZGVeM16lu_foo,_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear_mem2reg.ll =================================================================== --- test/Transforms/VecClone/linear_mem2reg.ll +++ test/Transforms/VecClone/linear_mem2reg.ll @@ -0,0 +1,22 @@ +; Check to see that the linear parameter i is updated with the correct stride when Mem2Reg is on. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %i, %stride.mul +; CHECK-NEXT: %add = add nsw i32 %x, %stride.add + +;ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %add = add nsw i32 %x, %i + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4lu_foo,_ZGVbN4lu_foo,_ZGVcM8lu_foo,_ZGVcN8lu_foo,_ZGVdM8lu_foo,_ZGVdN8lu_foo,_ZGVeM16lu_foo,_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/struct_linear_ptr.ll =================================================================== --- test/Transforms/VecClone/struct_linear_ptr.ll +++ test/Transforms/VecClone/struct_linear_ptr.ll @@ -0,0 +1,40 @@ +; Test that the stride is being applied correctly to struct field accesses. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4l_foo +; CHECK: simd.loop: +; CHECK: %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %0, i32 %stride.mul{{.*}} +; CHECK: %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 0 +; CHECK: %1 = load float, float* %field1, align 8 +; CHECK: %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %2, i32 %stride.mul{{.*}} +; CHECK: %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 4 +; CHECK: %3 = load float, float* %field5, align 8 +; CHECK: %add = fadd float %1, %3 + +; ModuleID = 'struct_linear_ptr.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.my_struct = type { float, i8, i32, i16, float, i64 } + +; Function Attrs: nounwind uwtable +define float @foo(%struct.my_struct* %s) #0 { +entry: + %s.addr = alloca %struct.my_struct*, align 8 + store %struct.my_struct* %s, %struct.my_struct** %s.addr, align 8 + %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %0, i32 0, i32 0 + %1 = load float, float* %field1, align 8 + %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %2, i32 0, i32 4 + %3 = load float, float* %field5, align 8 + %add = fadd float %1, %3 + ret float %add +} + +attributes #0 = { norecurse nounwind readonly uwtable "vector-variants"="_ZGVbM4l_foo,_ZGVbN4l_foo,_ZGVcM8l_foo,_ZGVcN8l_foo,_ZGVdM8l_foo,_ZGVdN8l_foo,_ZGVeM16l_foo,_ZGVeN16l_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum.ll +++ test/Transforms/VecClone/two_vec_sum.ll @@ -0,0 +1,59 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the non-masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
+ +; CHECK-LABEL: <4 x i32> @_ZGVbN4vv_vec_sum(<4 x i32> %i, <4 x i32> %j) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mask.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mask.ll +++ test/Transforms/VecClone/two_vec_sum_mask.ll @@ -0,0 +1,71 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
+ +; Begin non-masked variant checking + +; CHECK-LABEL: <4 x i32> @_ZGVbM4vv_vec_sum(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.mask = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i, align 4 +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j, align 4 +; CHECK-NEXT: store <4 x i32> %mask, <4 x i32>* %vec.mask +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: %mask.cast = bitcast <4 x i32>* %vec.mask to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %mask.gep = getelementptr i32, i32* %mask.cast, i32 %index +; CHECK-NEXT: %mask.parm = load i32, i32* %mask.gep +; CHECK-NEXT: %mask.cond = icmp ne i32 %mask.parm, 0 +; CHECK-NEXT: br i1 %mask.cond, label %simd.loop.then, label %simd.loop.else + +; CHECK: simd.loop.then: +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.else: +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mem2reg.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mem2reg.ll +++ test/Transforms/VecClone/two_vec_sum_mem2reg.ll @@ -0,0 +1,31 @@ +; Check to be sure that when Mem2Reg is on that all updates to instructions referring to the original 
+; parameter are updated correctly. When Mem2Reg is on, instructions will refer to the parameters +; directly and not through a load, which is why this is tested separately. + +; Note: the LLVM IR used as input to this test has already had Mem2Reg applied to it, so no need to +; do that here. This happens at higher optimization levels such as -O2. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking + +; CHECK-LABEL: @_ZGVbN4vv_vec_sum +; CHECK: simd.loop: +; CHECK: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK: %vec.i.elem = load i32, i32* %vec.i.cast.gep +; CHECK: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK: %vec.j.elem = load i32, i32* %vec.j.cast.gep +; CHECK: %add = add nsw i32 %vec.i.elem, %vec.j.elem + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %add = add nsw i32 %i, %j + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/uniform.ll =================================================================== --- test/Transforms/VecClone/uniform.ll +++ test/Transforms/VecClone/uniform.ll @@ -0,0 +1,25 @@ +; Check to make sure the initial parameter store of the uniform parameter is sunk into the loop. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: <4 x i32> @_ZGVbN4u_foo(i32 %b) +; CHECK: simd.loop: +; CHECK: store i32 %b + +; ModuleID = 'uniform.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %b) #0 { +entry: + %b.addr = alloca i32, align 4 + store i32 %b, i32* %b.addr, align 4 + %0 = load i32, i32* %b.addr, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* %b.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + ret i32 %1 +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4u_foo,_ZGVbN4u_foo,_ZGVcM8u_foo,_ZGVcN8u_foo,_ZGVdM8u_foo,_ZGVdN8u_foo,_ZGVeM16u_foo,_ZGVeN16u_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/vector_ptr.ll =================================================================== --- test/Transforms/VecClone/vector_ptr.ll +++ test/Transforms/VecClone/vector_ptr.ll @@ -0,0 +1,25 @@ +; Test that vector of pointers are handled with correctly in loop and that incompatible function return/arg attributes are removed. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN2v_dowork +; CHECK: simd.loop: +; CHECK: %vec.p.cast.gep = getelementptr float*, float** %vec.p.cast, i32 %index +; CHECK: %vec.p.elem = load float*, float** %vec.p.cast.gep +; CHECK: %add.ptr = getelementptr inbounds float, float* %vec.p.elem, i64 1 +; CHECK: %ret.cast.gep = getelementptr float*, float** %ret.cast, i32 %index +; CHECK: store float* %add.ptr, float** %ret.cast.gep + +source_filename = "vector_ptr.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: norecurse nounwind readnone uwtable +define nonnull float* @dowork(float* readnone %p) local_unnamed_addr #0 { +entry: + %add.ptr = getelementptr inbounds float, float* %p, i64 1 + ret float* %add.ptr +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN2v_dowork,_ZGVcN4v_dowork,_ZGVdN4v_dowork,_ZGVeN8v_ +dowork,_ZGVbM2v_dowork,_ZGVcM4v_dowork,_ZGVdM4v_dowork,_ZGVeM8v_dowork" } Index: test/Transforms/VecClone/void_foo.ll =================================================================== --- test/Transforms/VecClone/void_foo.ll +++ test/Transforms/VecClone/void_foo.ll @@ -0,0 +1,19 @@ +; Check to make sure we can handle void foo() function + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: void @_ZGVbN4_foo() +; CHECK: entry: +; CHECK: ret void + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @foo() #0 { +entry: + ret void +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4_foo1,_ZGVbN4_foo1,_ZGVcM8_foo1,_ZGVcN8_foo1,_ZGVdM8_foo1,_ZGVdN8_foo1,_ZGVeM16_foo1,_ZGVeN16_foo1" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/bugpoint/bugpoint.cpp =================================================================== --- tools/bugpoint/bugpoint.cpp +++ tools/bugpoint/bugpoint.cpp @@ -134,6 +134,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); #ifdef LINK_POLLY_INTO_TOOLS polly::initializePollyPasses(Registry); Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -389,6 +389,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeVecClonePass(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeScalarizeMaskedMemIntrinPass(Registry);
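The variant names exercised by the tests above all follow the vector function ABI mangling _ZGV<isa><mask><vlen><parameter kinds>_<scalar name>; for example _ZGVbN4uvl_dowork is the SSE (XMM) class, non-masked, vector length 4, with parameter kinds uniform/vector/linear. The sketch below is a rough, standalone illustration of that decoding only; it is not part of the patch, the helper names are made up, and it deliberately ignores the optional stride digits and alignment suffixes the full ABI allows after a parameter kind.

#include <cctype>
#include <iostream>
#include <string>

struct VariantInfo {
  char Isa = '?';       // 'b' XMM (SSE), 'c' YMM1 (AVX), 'd' YMM2 (AVX2), 'e' ZMM (AVX-512)
  bool Masked = false;  // 'M' masked variant, 'N' non-masked
  unsigned Vlen = 0;    // vector length
  std::string Kinds;    // per-parameter kinds, e.g. "uvl"
  std::string Scalar;   // original scalar function name
};

// Decode a name of the form _ZGV<isa><mask><vlen><kinds>_<name>.  Simplified:
// any stride/alignment characters are left inside Kinds untouched.
static bool decodeVariant(const std::string &Name, VariantInfo &Out) {
  if (Name.size() < 8 || Name.compare(0, 4, "_ZGV") != 0)
    return false;
  std::string::size_type Pos = 4;
  Out.Isa = Name[Pos++];
  char MaskChar = Name[Pos++];
  if (MaskChar != 'M' && MaskChar != 'N')
    return false;
  Out.Masked = (MaskChar == 'M');
  while (Pos < Name.size() && std::isdigit(static_cast<unsigned char>(Name[Pos])))
    Out.Vlen = Out.Vlen * 10 + static_cast<unsigned>(Name[Pos++] - '0');
  std::string::size_type Sep = Name.find('_', Pos);
  if (Out.Vlen == 0 || Sep == std::string::npos || Sep + 1 >= Name.size())
    return false;
  Out.Kinds = Name.substr(Pos, Sep - Pos);
  Out.Scalar = Name.substr(Sep + 1);
  return true;
}

int main() {
  VariantInfo VI;
  if (decodeVariant("_ZGVbN4uvl_dowork", VI))
    std::cout << VI.Scalar << ": isa=" << VI.Isa << " masked=" << VI.Masked
              << " vlen=" << VI.Vlen << " kinds=" << VI.Kinds << '\n';
  return 0;
}

Run as-is, this prints "dowork: isa=b masked=0 vlen=4 kinds=uvl", matching the non-masked SSE variant checked in the all_parm_types test.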