Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -29,6 +29,7 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/DataTypes.h" #include +#include namespace llvm { @@ -628,6 +629,9 @@ /// \brief Additional properties of an operand's values. enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; + /// \brief Default ISA for vector functions. + static const int UnknownISA = -1; + /// \return The number of scalar or vector registers that the target has. /// If 'Vectors' is true, it returns the number of vector registers. If it is /// set to false, it returns the number of scalar registers. @@ -887,6 +891,31 @@ unsigned ChainSizeInBytes, VectorType *VecTy) const; + /// \returns The maximum vector register width for \p IsaClass. + unsigned getISAClassMaxRegisterWidth(int ISAClass) const; + + /// \returns The ISA class as a string. + std::string isaClassToString(int ISAClass) const; + + /// \returns The ISAClass based on the maximum vector register size supported + /// by the target. + int getISAClassForMaxVecRegSize() const; + + /// \returns The maximum vector register width based on ISAClass \p Class, + /// as defined in the vector function ABI. + unsigned maximumSizeofISAClassVectorRegister(int ISAClass, Type *Ty) const; + + /// \returns The encoded ISA class for the mangled vector variant name based + /// on \p IsaClass. + char encodeISAClass(int ISAClass) const; + + /// \returns The ISAClass from the character encoded \p IsaClass of the + /// mangled vector variant function name. + int decodeISAClass(char ISAClass) const; + + /// \returns The target legalized type of \P Ty based on ISAClass \p IsaClass. + Type* promoteToSupportedType(Type *Ty, int ISAClass) const; + /// Flags describing the kind of vector reduction. 
struct ReductionFlags { ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {} @@ -1088,6 +1117,14 @@ virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual unsigned getISAClassMaxRegisterWidth(int ISAClass) const = 0; + virtual std::string isaClassToString(int ISAClass) const = 0; + virtual int getISAClassForMaxVecRegSize() const = 0; + virtual unsigned maximumSizeofISAClassVectorRegister(int ISAClass, + Type *Ty) const = 0; + virtual char encodeISAClass(int ISAClass) const = 0; + virtual int decodeISAClass(char ISAClass) const = 0; + virtual Type* promoteToSupportedType(Type *Ty, int ISAClass) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; @@ -1450,6 +1487,28 @@ VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + unsigned getISAClassMaxRegisterWidth(int ISAClass) const override { + return Impl.getISAClassMaxRegisterWidth(ISAClass); + } + std::string isaClassToString(int ISAClass) const override { + return Impl.isaClassToString(ISAClass); + } + int getISAClassForMaxVecRegSize() const override { + return Impl.getISAClassForMaxVecRegSize(); + } + unsigned maximumSizeofISAClassVectorRegister(int ISAClass, + Type *Ty) const override { + return Impl.maximumSizeofISAClassVectorRegister(ISAClass, Ty); + } + char encodeISAClass(int ISAClass) const override { + return Impl.encodeISAClass(ISAClass); + } + int decodeISAClass(char ISAClass) const override { + return Impl.decodeISAClass(ISAClass); + } + Type* promoteToSupportedType(Type *Ty, int ISAClass) const override { + return Impl.promoteToSupportedType(Ty, ISAClass); + } bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -24,6 +24,7 @@ #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" +#include namespace llvm { @@ -529,6 +530,35 @@ return VF; } + unsigned getISAClassMaxRegisterWidth(int ISAClass) const { + return 0; + } + + std::string isaClassToString(int ISAClass) const { + return "Unknown ISA"; + } + + int getISAClassForMaxVecRegSize() const { + return TTI::UnknownISA; + } + + // Used by VectorVariant to determine the VF of the simd function. + unsigned maximumSizeofISAClassVectorRegister(int ISAClass, Type *Ty) const { + return 0; + } + + char encodeISAClass(int ISAClass) const { + return '?'; + } + + int decodeISAClass(char ISAClass) const { + return TTI::UnknownISA; + } + + Type* promoteToSupportedType(Type *Ty, int ISAClass) const { + return Ty; + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; Index: include/llvm/Analysis/VectorVariant.h =================================================================== --- include/llvm/Analysis/VectorVariant.h +++ include/llvm/Analysis/VectorVariant.h @@ -0,0 +1,228 @@ +//===---- llvm/Transforms/VectorVariant.h - Vector utilities -*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This header file defines the VectorVariant class and implements the encoding +/// and decoding utilities for VectorVariant objects. Multiple VectorVariant +/// objects can be created (masked, non-masked, etc.) and associated with the +/// original scalar function. These objects are then used to clone new functions +/// that can be vectorized. This class follows the standards defined in the +/// vector function ABI. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H +#define LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H + +#include +#include +#include +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" + +static const int NOT_ALIGNED = 1; +static const int POSITIVE = 1; +static const int NEGATIVE = -1; + +namespace llvm { + +class VectorKind { + +public: + enum ParmKind { + StrideParmKind = 's', + LinearParmKind = 'l', + UniformParmKind = 'u', + VectorParmKind = 'v' + }; + + VectorKind(char K, int S, int A = NOT_ALIGNED) { + + assert((S == notAValue() || K == StrideParmKind || K == LinearParmKind) && + "only linear vectors have strides"); + + assert((K != LinearParmKind || S != notAValue()) && + "linear vectors must have a stride"); + + assert((K != StrideParmKind || S != notAValue()) && + "variable stride vectors must have a stride"); + + assert((K != StrideParmKind || S >= 0) && + "variable stride position must be non-negative"); + + assert(A > 0 && "alignment must be positive"); + + Kind = K; + Stride = S; + Alignment = A; + } + + VectorKind(const VectorKind &Other) { + Kind = Other.Kind; + Stride = Other.Stride; + Alignment = Other.Alignment; + } + + /// \brief Is the stride for a linear parameter a uniform variable? (i.e., + /// the stride is stored in a variable but is uniform) + bool isVariableStride() { return Kind == StrideParmKind; } + + /// \brief Is the stride for a linear variable non-unit stride? + bool isNonUnitStride() { return Kind == LinearParmKind && Stride != 1; } + + /// \brief Is the stride for a linear variable unit stride? + bool isUnitStride() { return Kind == LinearParmKind && Stride == 1; } + + /// \brief Is this a linear parameter? + bool isLinear() { + return isVariableStride() || isNonUnitStride() || isUnitStride(); + } + + /// \brief Is this a uniform parameter? + bool isUniform() { return Kind == UniformParmKind; } + + /// \brief Is this a vector parameter? + bool isVector() { return Kind == VectorParmKind; } + + /// \brief Is the parameter aligned? + bool isAligned() { return Alignment != NOT_ALIGNED; } + + /// \brief Get the stride associated with a linear parameter. + int getStride() { return Stride; } + + /// \brief Get the alignment associated with a linear parameter. + int getAlignment() { return Alignment; } + + /// \brief Represents a don't care value for strides of parameters other + /// than linear parameters. + static int notAValue() { return -1; } + + /// \brief Encode the parameter information into a mangled string + /// corresponding to the standards defined in the vector function ABI. 
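+  /// As a rough illustration (hypothetical parameters, following the rules
+  /// in the body below): a vector parameter encodes as "v", a uniform
+  /// parameter as "u", a unit-stride linear parameter as "l", a linear
+  /// parameter with constant stride 2 and 16-byte alignment as "l2a16",
+  /// and a linear parameter whose stride is given by argument position 1
+  /// as "s1".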
+ std::string encode() { + std::stringstream SST; + SST << Kind; + + if (isNonUnitStride()) { + if (Stride >= 0) + SST << Stride; + else + SST << "n" << -Stride; + } + + if (isVariableStride()) + SST << Stride; + + if (isAligned()) + SST << 'a' << Alignment; + + return SST.str(); + } + +private: + char Kind; // linear, uniform, vector + int Stride; + int Alignment; +}; + +class VectorVariant { + +private: + const TargetTransformInfo *TTI; + int ISAClass; + bool Mask; + unsigned int Vlen; + std::vector Parameters; + + static std::string prefix() { return "_ZGV"; } + +public: + VectorVariant(StringRef FuncName, const TargetTransformInfo *TTI); + + /// \brief Get the ISA corresponding to this vector variant. + int getISA() { return ISAClass; } + + /// \brief Is this a masked vector function variant? + bool isMasked() { return Mask; } + + /// \brief Get the vector length of the vector variant. + unsigned int getVlen() { return Vlen; } + + /// \brief Get the parameters of the vector variant. + std::vector &getParameters() { return Parameters; } + + /// \brief Build the mangled name for the vector variant. This function + /// builds a mangled name by including the encodings for the ISA class, + /// mask information, and all parameters. + std::string encode() { + + std::stringstream SST; + SST << prefix() << TTI->encodeISAClass(ISAClass) << encodeMask(Mask) << Vlen; + + std::vector::iterator It = Parameters.begin(); + std::vector::iterator End = Parameters.end(); + + if (isMasked()) + End--; // mask parameter is not encoded + + for (; It != End; ++It) + SST << (*It).encode(); + + SST << "_"; + + return SST.str(); + } + + /// \brief Generate a function name corresponding to a vector variant. + std::string generateFunctionName(StringRef ScalarFuncName) { + std::string Name = encode(); + return Name + ScalarFuncName.str(); + } + + /// \brief Some targets do not support particular types, so promote to a type + /// that is supported. + Type *promoteToSupportedType(Type *Ty) { + return TTI->promoteToSupportedType(Ty, getISA()); + } + + /// \brief Check to see if this is a vector variant based on the function + /// name. + static bool isVectorVariant(StringRef FuncName) { + return FuncName.startswith(prefix()); + } + + /// \brief Encode the mask information for the mangled variant name. + static char encodeMask(bool EncodeMask) { + + return EncodeMask ? 'M' : 'N'; + } + + /// \brief Decode the mask information from the mangled variant name. + static bool decodeMask(char MaskToDecode) { + + switch (MaskToDecode) { + case 'M': + return true; + case 'N': + return false; + } + + llvm_unreachable("unsupported mask"); + } + + /// \brief Calculate the vector length for the vector variant. 
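+  /// For example (assuming the X86 register widths used elsewhere in this
+  /// patch): a 256-bit AVX2 register with a 32-bit float characteristic
+  /// type gives a vector length of 256 / 32 = 8.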
+ unsigned calcVlen(int ISAClass, Type *Ty); +}; + +} // llvm namespace + +#endif // LLVM_TRANSFORMS_UTILS_INTEL_VECTORVARIANT_H Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -377,6 +377,7 @@ void initializeWriteBitcodePassPass(PassRegistry&); void initializeWriteThinLTOBitcodePass(PassRegistry&); void initializeXRayInstrumentationPass(PassRegistry&); +void initializeVecClonePass(PassRegistry&); } // end namespace llvm Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -48,6 +48,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/VecClone.h" #include "llvm/Transforms/Vectorize.h" #include @@ -207,6 +208,7 @@ (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); (void) llvm::createScalarizeMaskedMemIntrinPass(); + (void) llvm::createVecClonePass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); Index: include/llvm/Transforms/Utils/VecClone.h =================================================================== --- include/llvm/Transforms/Utils/VecClone.h +++ include/llvm/Transforms/Utils/VecClone.h @@ -0,0 +1,122 @@ +//===-------------- VecClone.h - Class definition -*- C++ -*---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This file defines the VecClone pass class. +/// +// ===--------------------------------------------------------------------=== // + +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +#ifndef LLVM_TRANSFORMS_VPO_VECCLONE_H +#define LLVM_TRANSFORMS_VPO_VECCLONE_H + +namespace llvm { + +class ModulePass; + +/// \brief Contains the names of the declared vector function variants +typedef std::vector DeclaredVariants; + +/// \brief Contains a mapping of a function to its vector function variants +typedef std::map FunctionVariants; + +struct VecClonePass : public PassInfoMixin { + +public: + /// \brief Get all functions marked for vectorization in module and their + /// list of variants. + void getFunctionsToVectorize(Module &M, FunctionVariants &FuncVars); + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + // Glue for old PM + bool runImpl(Module &M, Function &F, VectorVariant &Variant); + +private: + /// \brief Returns a floating point or integer constant depending on Ty. + template + Constant* getConstantValue(Type *Ty, LLVMContext &Context, T Val); + + /// \brief Make a copy of the function if it requires a vector variant. + Function* CloneFunction(Module &M, Function &F, VectorVariant &V); + + /// \brief Update the users of vector and linear parameters. Vector + /// parameters must be now be indexed to reference the appropriate + /// element and for linear parameters the stride will be added. 
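+  /// Sketch with hypothetical names: a use of a vector parameter %b becomes
+  /// a load of lane %index from the widened alloca of %b, and a use of a
+  /// linear parameter %k with stride 1 becomes a use of (%k + %index).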
+ void updateParameterUsers(Function *Clone, VectorVariant &Variant, + BasicBlock &EntryBlock, PHINode *Phi, + const DataLayout &DL); + + /// \brief Performs a translation of a -> &a[i] for widened alloca + /// instructions within the loop body of a simd function. + void updateAllocaUsers(Function *Clone, PHINode *Phi, + DenseMap &AllocaMap); + + /// \brief Widen alloca instructions. Vector parameters will have a vector + /// alloca of size VF and and linear/uniform parameters will have an array + /// alloca of size VF. + void widenAllocaInstructions( + Function *Clone, + DenseMap &AllocaMap, + BasicBlock &EntryBlock, + VectorVariant &Variant, + const DataLayout &DL); + + /// \brief Generate a loop around the function body. + PHINode* generateLoopForFunctionBody(Function *Clone, + BasicBlock *EntryBlock, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + BasicBlock *ReturnBlock, + int VF); + + /// \brief Remove any incompatible parameter attributes as a result of + /// widening vector parameters. + void removeIncompatibleAttributes(Function *Clone); + + /// \brief Check to see if the function is simple enough that a loop does + /// not need to be inserted into the function. + bool isSimpleFunction(Function *Clone, BasicBlock &EntryBlock); + + /// \brief Inserts the if/else split and mask condition for masked SIMD + /// functions. + void insertSplitForMaskedVariant(Function *Clone, BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi); + + /// \brief Adds metadata to the conditional branch of the simd loop latch to + /// prevent loop unrolling and to force vectorization at VF. + void addLoopMetadata(BasicBlock *Latch, unsigned VF); +}; + +class VecClone : public ModulePass { + + bool runOnModule(Module &M) override; + + public: + static char ID; + VecClone(); + void print(raw_ostream &OS, const Module * = nullptr) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + VecClonePass Impl; + +}; // end pass class + +ModulePass *createVecClonePass(); + +} // end llvm namespace + +#endif // LLVM_TRANSFORMS_VPO_VECCLONE_H Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -84,6 +84,7 @@ ValueLatticeUtils.cpp ValueTracking.cpp VectorUtils.cpp + VectorVariant.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -584,6 +584,38 @@ return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +unsigned +TargetTransformInfo::getISAClassMaxRegisterWidth(int ISAClass) const { + return TTIImpl->getISAClassMaxRegisterWidth(ISAClass); +} + +std::string TargetTransformInfo::isaClassToString(int ISAClass) const { + return TTIImpl->isaClassToString(ISAClass); +} + +int TargetTransformInfo::getISAClassForMaxVecRegSize() const { + return TTIImpl->getISAClassForMaxVecRegSize(); +} + +unsigned TargetTransformInfo::maximumSizeofISAClassVectorRegister( + int ISAClass, Type *Ty) const { + + return TTIImpl->maximumSizeofISAClassVectorRegister(ISAClass, Ty); +} + +char TargetTransformInfo::encodeISAClass(int ISAClass) const { + return TTIImpl->encodeISAClass(ISAClass); +} + +int TargetTransformInfo::decodeISAClass(char ISAClass) const { + return TTIImpl->decodeISAClass(ISAClass); +} + +Type* 
TargetTransformInfo::promoteToSupportedType(Type *Ty, + int ISAClass) const { + return TTIImpl->promoteToSupportedType(Ty, ISAClass); +} + bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); Index: lib/Analysis/VectorVariant.cpp =================================================================== --- lib/Analysis/VectorVariant.cpp +++ lib/Analysis/VectorVariant.cpp @@ -0,0 +1,117 @@ +//===---------- VectorVariant.cpp - Vector function ABI -*- C++ -*---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VectorVariant class and corresponding utilities. +/// VectorVariant objects are associated with a scalar function and are used +/// to generate new functions that can be vectorized. VectorVariants are +/// determined by inspecting the function attributes associated with the scalar +/// function. When a mangled function name is found in the attributes (indicated +/// as "_ZGV"), a VectorVariant object is created. The class and utilities +/// in this file follow the standards defined in the vector function ABI. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +/// \brief Generate a vector variant by decoding the mangled string for the +/// variant contained in the original scalar function's attributes. For +/// example: "_ZGVxN4". The name mangling is defined in the vector function +/// ABI. Based on this string, the parameter kinds (uniform, linear, vector), +/// vector length, parameter alignment, and masking are determined. +VectorVariant::VectorVariant(StringRef FuncName, const TargetTransformInfo *TTI) + : TTI(TTI) { + + assert(isVectorVariant(FuncName) && "invalid vector variant format"); + + std::stringstream SST(FuncName.drop_front(prefix().size())); + + // mandatory annotations + char EncodedISA; + SST.get(EncodedISA); + ISAClass = TTI->decodeISAClass(EncodedISA); + + char EncodedMask; + SST.get(EncodedMask); + Mask = decodeMask(EncodedMask); + SST >> Vlen; + + // optional parameter annotations + while (SST.peek() != '_') { + + char Kind; + int Stride = VectorKind::notAValue(); + int StrideSign = POSITIVE; + int Alignment = NOT_ALIGNED; + + // Get parameter kind + SST.get(Kind); + + // Default stride for linear is 1. If the stride for a parameter is 1, + // then the front-end will not encode it and we will not have set the + // correct stride below. 
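+    // For example, for a hypothetical "_ZGVbN4uvl_foo" the parameters decode
+    // as uniform, vector, and unit-stride linear (no stride digits follow
+    // the 'l', so it keeps the default set just below), while "l2" or "ln4"
+    // would decode below as constant strides 2 and -4.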
+ if (Kind == VectorKind::LinearParmKind) + Stride = 1; + + // Handle optional stride + if (SST.peek() == 'n') { + // Stride is negative + SST.ignore(1); + StrideSign = NEGATIVE; + } + + if (std::isdigit(SST.peek())) { + // Extract constant stride + SST >> Stride; + assert((Kind != VectorKind::StrideParmKind || Stride >= 0) && + "variable stride argument index cannot be negative"); + } + + Stride *= StrideSign; + // Handle optional alignment + if (SST.peek() == 'a') { + SST.ignore(1); + SST >> Alignment; + } + + VectorKind VecKind(Kind, Stride, Alignment); + Parameters.push_back(VecKind); + } + + if (Mask) { + // Masked variants will have an additional mask parameter + VectorKind VecKind(VectorKind::VectorParmKind, VectorKind::notAValue()); + Parameters.push_back(VecKind); + } +} + +/// \brief Determine the vector variant's vector length based on the +/// characteristic data type defined in the vector function ABI and target +/// vector register width. +unsigned int VectorVariant::calcVlen(int ISAClass, + Type *CharacteristicDataType) { + assert(CharacteristicDataType && + CharacteristicDataType->getPrimitiveSizeInBits() != 0 && + "expected characteristic data type to have a primitive size in bits"); + + unsigned int VectorRegisterSize = + TTI->maximumSizeofISAClassVectorRegister(ISAClass, + CharacteristicDataType); + + assert(VectorRegisterSize != 0 && "could not find vector register size for " + "ISAClass - check to make sure it's " + "supported in TTI"); + + return VectorRegisterSize / CharacteristicDataType->getPrimitiveSizeInBits(); +} Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -142,6 +142,7 @@ #include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/SimplifyInstructions.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" +#include "llvm/Transforms/Utils/VecClone.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" @@ -728,6 +729,10 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. OptimizePM.addPass(LoopDistributePass()); + // Insert a loop with VF trip count around the body of functions that are + // vector variants. + MPM.addPass(VecClonePass()); + // Now run the core loop vectorizer. OptimizePM.addPass(LoopVectorizePass()); Index: lib/Passes/PassRegistry.def =================================================================== --- lib/Passes/PassRegistry.def +++ lib/Passes/PassRegistry.def @@ -72,6 +72,7 @@ MODULE_PASS("sample-profile", SampleProfileLoaderPass()) MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) +MODULE_PASS("vec-clone", VecClonePass()) MODULE_PASS("verify", VerifierPass()) #undef MODULE_PASS Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -58,6 +58,15 @@ /// \name Vector TTI Implementations /// @{ + /// ISA classes defined in the vector function ABI. 
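+  /// These map to the ISA letters 'b', 'c', 'd' and 'e' used in the mangled
+  /// variant names (see encodeISAClass()/decodeISAClass() below).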
+ enum ISAClass { + SSE, + AVX, + AVX2, + AVX512, + ISAClassesNum + }; + unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; @@ -129,6 +138,15 @@ const Function *Callee) const; bool enableMemCmpExpansion(unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); + + unsigned getISAClassMaxRegisterWidth(int ISAClass) const; + std::string isaClassToString(int ISAClass) const; + int getISAClassForMaxVecRegSize() const; + unsigned maximumSizeofISAClassVectorRegister(int ISAClass, + Type *Ty) const; + char encodeISAClass(int ISAClass) const; + int decodeISAClass(char ISAClass) const; + Type* promoteToSupportedType(Type *Ty, int ISAClass) const; private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2808,3 +2808,134 @@ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } + +unsigned X86TTIImpl::getISAClassMaxRegisterWidth(int ISAClass) const { + switch (ISAClass) { + case SSE: + return 128; + case AVX: + case AVX2: + return 256; + case AVX512: + return 512; + default: + llvm_unreachable("unsupported ISA class"); + } +} + +std::string X86TTIImpl::isaClassToString(int ISAClass) const { + switch (ISAClass) { + case SSE: + return "SSE"; + case AVX: + return "AVX"; + case AVX2: + return "AVX2"; + case AVX512: + return "AVX512"; + default: + llvm_unreachable("Unknown ISA class"); + } +} + +int X86TTIImpl::getISAClassForMaxVecRegSize() const { + ISAClass TargetIsaClass; + unsigned TargetMaxRegWidth = getRegisterBitWidth(true); + switch (TargetMaxRegWidth) { + case 128: + TargetIsaClass = SSE; + break; + case 256: + if (ST->hasAVX2()) + TargetIsaClass = AVX2; + else + TargetIsaClass = AVX; + break; + case 512: + TargetIsaClass = AVX512; + break; + default: + llvm_unreachable("Invalid target vector register width"); + } + return TargetIsaClass; +} + +unsigned X86TTIImpl::maximumSizeofISAClassVectorRegister(int ISAClass, + Type *Ty) const { + + assert((Ty->isIntegerTy() || Ty->isFloatTy() || Ty->isDoubleTy() || + Ty->isPointerTy()) && + "unsupported type"); + + unsigned int VectorRegisterSize = 0; + + switch (ISAClass) { + case SSE: + VectorRegisterSize = 128; + break; + case AVX: + if (Ty->isIntegerTy() || Ty->isPointerTy()) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case AVX2: + if (Ty->isIntegerTy(8)) + VectorRegisterSize = 128; + else + VectorRegisterSize = 256; + break; + case AVX512: + VectorRegisterSize = 512; + break; + default: + llvm_unreachable("unknown isa class"); + return 0; + } + + assert(VectorRegisterSize != 0 && "unsupported ISA/type combination"); + return VectorRegisterSize; +} + +char X86TTIImpl::encodeISAClass(int ISAClass) const { + switch (ISAClass) { + case SSE: + return 'b'; + case AVX: + return 'c'; + case AVX2: + return 'd'; + case AVX512: + return 'e'; + default: + break; + } + + assert(false && "unsupported ISA class"); + return '?'; +} + +int X86TTIImpl::decodeISAClass(char ISAClass) const { + switch (ISAClass) { + case 'b': + return SSE; + case 'c': + return AVX; + case 'd': + return AVX2; + case 'e': + return AVX512; + default: + llvm_unreachable("unsupported ISA class"); + return SSE; + } +} + +Type* 
X86TTIImpl::promoteToSupportedType(Type *Ty, int ISAClass) const { + // On ZMM promote char and short to int + if (ISAClass == AVX512 && (Ty->isIntegerTy(8) || + Ty->isIntegerTy(16))) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Utils/VecClone.h" using namespace llvm; @@ -426,6 +427,9 @@ // new unnamed globals. if (PrepareForThinLTO) MPM.add(createNameAnonGlobalPass()); + + MPM.add(createVecClonePass()); + return; } @@ -588,6 +592,10 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass()); + // Insert a VF trip count loop around the body of functions that have vector + // variants. + MPM.add(createVecClonePass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -52,6 +52,7 @@ UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + VecClone.cpp VNCoercion.cpp ADDITIONAL_HEADER_DIRS Index: lib/Transforms/Utils/VecClone.cpp =================================================================== --- lib/Transforms/Utils/VecClone.cpp +++ lib/Transforms/Utils/VecClone.cpp @@ -0,0 +1,893 @@ +//=------- VecClone.cpp - Vector function to loop transform -*- C++ -*-------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===--------------------------------------------------------------------=== // +/// +/// \file +/// This pass inserts the body of a vector function inside a vector length +/// trip count scalar loop for functions that are declared SIMD. The pass +/// currently follows the gcc vector ABI requirements for name mangling +/// encodings, but will be extended in the future to also support the Intel +/// vector ABI. References to both ABIs can be found here: +/// +/// https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt +/// https://software.intel.com/sites/default/files/managed/b4/c8/Intel-Vector-Function-ABI.pdf +/// +/// Conceptually, this pass performs the following transformation: +/// +/// Before Translation: +/// +/// main.cpp +/// +/// #pragma omp declare simd uniform(a) linear(k) +/// extern float dowork(float *a, float b, int k); +/// +/// float a[4096]; +/// float b[4096]; +/// int main() { +/// int k; +/// for (k = 0; k < 4096; k++) { +/// b[k] = k; +/// } +/// #pragma clang loop vectorize(enable) +/// for (k = 0; k < 4096; k++) { +/// a[k] = k * 0.5; +/// a[k] = dowork(a, b[k], k); +/// } +/// } +/// +/// dowork.cpp +/// +/// #pragma omp declare simd uniform(a) linear(k) #0 +/// float dowork(float *a, float b, int k) { +/// return sinf(a[k]) + b; +/// } +/// +/// attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4uvl_", +/// "ZGVbN4uvl_", ... 
} +/// +/// After Translation: +/// +/// dowork.cpp +/// +/// // Non-masked variant +/// +/// "_ZGVbN4uvl_dowork(float *a, b, int k) { +/// alloc vec_ret; +/// alloc vec_b; +/// // casts from vector to scalar pointer allows loop to be in a scalar form +/// // that can be vectorized easily. +/// ret_cast = bitcast * vec_ret to float*; +/// vec_b_cast = bitcast * vec_b to float*; +/// store b, * vec_b; +/// for (int i = 0; i < VL; i++, k++) { +/// ret_cast[i] = sinf(a[k]) + vec_b_cast[i]; +/// } +/// return vec_ret; +/// } +/// +/// // Masked variant +/// +/// "_ZGVbM4uvl_dowork(float *a, b, int k, +/// mask) { +/// alloc vec_ret; +/// alloc vec_b; +/// ret_cast = bitcast * vec_ret to float*; +/// vec_b_cast = bitcast * vec_b to float*; +/// store b, * vec_b; +/// for (int i = 0; i < VL; i++, k++) { +/// if (mask[i] != 0) +/// ret_cast[i] = sinf(a[k]) + vec_b_cast[i]; +/// } +/// return vec_ret; +/// } +/// +// ===--------------------------------------------------------------------=== // + +// This pass is flexible enough to recognize whether or not parameters have been +// registerized so that the users of the parameter can be properly updated. For +// instance, we need to know where the users of linear parameters are so that +// the stride can be added to them. +// +// In the following example, %i and %x are used directly by %add directly, so +// in this case the pass can just look for users of %i and %x. +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %add = add nsw i32 %x, %i +// ret i32 %add +// } +// +// When parameters have not been registerized, parameters are used indirectly +// through a store/load of the parameter to/from memory that has been allocated +// for them in the function. Thus, in this case, the pass looks for users of +// %0 and %1. +// +// define i32 @foo(i32 %i, i32 %x) #0 { +// entry: +// %i.addr = alloca i32, align 4 +// %x.addr = alloca i32, align 4 +// store i32 %i, i32* %i.addr, align 4 +// store i32 %x, i32* %x.addr, align 4 +// %0 = load i32, i32* %x.addr, align 4 +// %1 = load i32, i32* %i.addr, align 4 +// %add = add nsw i32 %0, %1 +// ret i32 %add +// } +// +// The pass must run at all optimization levels because it is possible that +// a loop calling the vector function is vectorized, but the vector function +// itself is not vectorized. For example, above main.cpp may be compiled at +// -O2, but dowork.cpp may be compiled at -O0. Therefore, it is required that +// the attribute list for the vector function specify all variants that must +// be generated by this pass so as to avoid any linking problems. This pass +// also serves to canonicalize the input IR to the loop vectorizer. 
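+// At a high level, for each "vector-variants" encoding on a scalar function,
+// runImpl():
+//   1. clones the function and widens its signature (CloneFunction),
+//   2. widens parameter allocas to vector/array allocas
+//      (widenAllocaInstructions),
+//   3. wraps the body in a VF-iteration loop (generateLoopForFunctionBody),
+//   4. rewrites alloca and parameter users to index the current lane
+//      (updateAllocaUsers, updateParameterUsers),
+//   5. inserts the mask check for masked variants
+//      (insertSplitForMaskedVariant), and
+//   6. tags the loop latch with metadata that disables unrolling and forces
+//      vectorization at VF (addLoopMetadata).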
+ +#include "llvm/Transforms/Utils/VecClone.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/Analysis/VectorVariant.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include + +#define SV_NAME "vec-clone" +#define DEBUG_TYPE "VecClone" + +using namespace llvm; + +VecClone::VecClone() : ModulePass(ID) {} + +void VecClone::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + +void VecClonePass::getFunctionsToVectorize(llvm::Module &M, + FunctionVariants &FuncVars) { + + // FuncVars will contain a one-to-many mapping between the original scalar + // function and the vector variant encoding strings (represented as + // attributes). The encodings correspond to functions that will be created by + // the caller of this function as vector versions of the original function. + // For example, if foo() is a function marked as a simd function, it will have + // several vector variant encodings like: "_ZGVbM4_foo", "_ZGVbN4_foo", + // "_ZGVcM8_foo", "_ZGVcN8_foo", "_ZGVdM8_foo", "_ZGVdN8_foo", "_ZGVeM16_foo", + // "_ZGVeN16_foo". The caller of this function will then clone foo() and name + // the clones using the above name manglings. The variant encodings correspond + // to differences in masked/non-masked execution, vector length, and target + // vector register size, etc. For more details, please refer to the vector + // function abi references listed at the top of this file. + + for (auto &F : M.functions()) { + if (F.hasFnAttribute("vector-variants")) { + Attribute Attr = F.getFnAttribute("vector-variants"); + StringRef VariantsStr = Attr.getValueAsString(); + SmallVector Variants; + VariantsStr.split(Variants, ','); + for (auto V : Variants) + FuncVars[&F].push_back(V); + } + } +} + +template Constant * +VecClonePass::getConstantValue(Type *Ty, LLVMContext &Context, int Val); +template Constant * +VecClonePass::getConstantValue(Type *Ty, LLVMContext &Context, float Val); +template Constant * +VecClonePass::getConstantValue(Type *Ty, LLVMContext &Context, double Val); +template +Constant *VecClonePass::getConstantValue(Type *Ty, LLVMContext &Context, T Val) { + Constant *ConstVal = nullptr; + + if (Ty->isIntegerTy()) { + ConstVal = ConstantInt::get(Ty, Val); + } else if (Ty->isFloatTy()) { + ConstVal = ConstantFP::get(Ty, Val); + } + + assert(ConstVal && "Could not generate constant for type"); + + return ConstVal; +} + +Function *VecClonePass::CloneFunction(Module &M, Function &F, + VectorVariant &V) { + + std::string VariantName = V.generateFunctionName(F.getName()); + if (M.getFunction(VariantName)) + return nullptr; + + FunctionType *OrigFunctionType = F.getFunctionType(); + Type *ReturnType = F.getReturnType(); + Type *CharacteristicType = calcCharacteristicType(F, V); + + // Expand return type to vector. 
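+  // e.g. a scalar return type of float with Vlen = 4 becomes <4 x float>.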
+ if (!ReturnType->isVoidTy()) + ReturnType = VectorType::get(ReturnType, V.getVlen()); + + std::vector ParmKinds = V.getParameters(); + SmallVector ParmTypes; + std::vector::iterator VKIt = ParmKinds.begin(); + for (auto *ParamTy : OrigFunctionType->params()) { + if (VKIt->isVector()) + ParmTypes.push_back( + VectorType::get(ParamTy->getScalarType(), V.getVlen())); + else + ParmTypes.push_back(ParamTy); + ++VKIt; + } + + if (V.isMasked()) { + Type *MaskVecTy = VectorType::get(CharacteristicType, V.getVlen()); + ParmTypes.push_back(MaskVecTy); + } + + FunctionType *CloneFuncType = FunctionType::get(ReturnType, ParmTypes, false); + Function *Clone = Function::Create( + CloneFuncType, GlobalValue::ExternalLinkage, VariantName, F.getParent()); + + ValueToValueMapTy Vmap; + Function::arg_iterator NewArgIt = Clone->arg_begin(); + for (auto &Arg: F.args()) { + NewArgIt->setName(Arg.getName()); + Vmap[&Arg] = &*NewArgIt; + ++NewArgIt; + } + + if (V.isMasked()) { + Argument &MaskArg = *NewArgIt; + MaskArg.setName("mask"); + } + + SmallVector Returns; + CloneFunctionInto(Clone, &F, Vmap, true, Returns); + + // Remove incompatible argument attributes (applied to the scalar argument, + // does not apply to its vector counterpart). This must be done after cloning + // the function because CloneFunctionInto() transfers parameter attributes + // from the original parameters in the Vmap. + AttrBuilder AB; + uint64_t Idx = 0; + for (auto &Arg : Clone->args()) { + Type *ArgType = Arg.getType(); + AB = AttributeFuncs::typeIncompatible(ArgType); + Clone->removeParamAttrs(Idx, AB); + ++Idx; + } + + AB = AttributeFuncs::typeIncompatible(ReturnType); + Clone->removeAttributes(AttributeList::ReturnIndex, AB); + + // Don't propagate vector variant attributes to the cloned function. These + // attributes are kept for the original function, however, because they + // are needed by the vectorizer. + Clone->removeFnAttr("vector-variants"); + + DEBUG(dbgs() << "After Cloning and Function Signature widening\n"); + DEBUG(Clone->dump()); + + return Clone; +} + +PHINode *VecClonePass::generateLoopForFunctionBody( + Function *Clone, BasicBlock *EntryBlock, BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, BasicBlock *ReturnBlock, int VectorLength) { + + // Create the phi node for the top of the loop block and add the back + // edge to the loop from the loop exit. + + PHINode *Phi = PHINode::Create(Type::getInt32Ty(Clone->getContext()), 2, + "index", &*LoopBlock->getFirstInsertionPt()); + + Constant *Inc = ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 1); + Constant *IndInit = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 0); + + Instruction *Induction = BinaryOperator::CreateAdd(Phi, Inc, "indvar"); + Induction->insertBefore(LoopExitBlock->getTerminator()); + + Constant *VL = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), VectorLength); + + Instruction *VLCmp = + new ICmpInst(CmpInst::ICMP_ULT, Induction, VL, "vl.cond"); + VLCmp->insertAfter(Induction); + + LoopExitBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(LoopBlock, ReturnBlock, VLCmp, LoopExitBlock); + + Phi->addIncoming(IndInit, EntryBlock); + Phi->addIncoming(Induction, LoopExitBlock); + + DEBUG(dbgs() << "After Loop Insertion\n"); + DEBUG(Clone->dump()); + + return Phi; +} + +bool VecClonePass::isSimpleFunction(Function *Clone, BasicBlock &EntryBlock) { + // For really simple functions, there is no need to go through the process + // of inserting a loop. 
+ + // Example: + // + // void foo(void) { + // return; + // } + // + // No need to insert a loop for this case since it's basically a no-op. Just + // clone the function and return. It's possible that we could have some code + // inside of a vector function that modifies global memory. Let that case go + // through. + ReturnInst *RetInst = dyn_cast(EntryBlock.getTerminator()); + if (RetInst && Clone->getReturnType()->isVoidTy()) + return true; + + return false; +} + +void VecClonePass::removeIncompatibleAttributes(Function *Clone) { + for (auto &Arg : Clone->args()) { + // For functions that only have a return instruction and are not void, + // the return type is widened to vector. For this case, the returned + // attribute becomes incompatible and must be removed. + if (Clone->hasParamAttribute(Arg.getArgNo(), Attribute::Returned)) + Clone->removeParamAttr(Arg.getArgNo(), Attribute::Returned); + } +} + +void VecClonePass::insertSplitForMaskedVariant( + Function *Clone, + BasicBlock *LoopBlock, + BasicBlock *LoopExitBlock, + Instruction *Mask, PHINode *Phi) { + + BasicBlock *LoopThenBlock = + LoopBlock->splitBasicBlock(LoopBlock->getFirstNonPHI(), "simd.loop.then"); + + BasicBlock *LoopElseBlock = BasicBlock::Create( + Clone->getContext(), "simd.loop.else", Clone, LoopExitBlock); + + BranchInst::Create(LoopExitBlock, LoopElseBlock); + + BitCastInst *BitCast = dyn_cast(Mask); + PointerType *BitCastType = dyn_cast(BitCast->getType()); + Type *PointeeType = BitCastType->getElementType(); + + GetElementPtrInst *MaskGep = GetElementPtrInst::Create( + PointeeType, Mask, Phi, "mask.gep", LoopBlock->getTerminator()); + + LoadInst *MaskLoad = + new LoadInst(MaskGep, "mask.parm", LoopBlock->getTerminator()); + + Type *CompareTy = MaskLoad->getType(); + Instruction *MaskCmp; + Constant *Zero; + + // Generate the compare instruction to see if the mask bit is on. In ICC, we + // use the movemask intrinsic which takes both float/int mask registers and + // converts to an integer scalar value, one bit representing each element. + // AVR construction will be complicated if this intrinsic is introduced here, + // so the current solution is to just generate either an integer or floating + // point compare instruction for now. This may change anyway if we decide to + // go to a vector of i1 values for the mask. I suppose this would be one + // positive reason to use vector of i1. + if (CompareTy->isIntegerTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0); + MaskCmp = new ICmpInst(LoopBlock->getTerminator(), CmpInst::ICMP_NE, + MaskLoad, Zero, "mask.cond"); + } else if (CompareTy->isFloatingPointTy()) { + Zero = getConstantValue(CompareTy, Clone->getContext(), 0.0); + MaskCmp = new FCmpInst(LoopBlock->getTerminator(), CmpInst::FCMP_UNE, + MaskLoad, Zero, "mask.cond"); + } else { + assert(0 && "Unsupported mask compare"); + } + + TerminatorInst *Term = LoopBlock->getTerminator(); + Term->eraseFromParent(); + BranchInst::Create(LoopThenBlock, LoopElseBlock, MaskCmp, LoopBlock); + + DEBUG(dbgs() << "After Split Insertion For Masked Variant\n"); + DEBUG(Clone->dump()); +} + +void VecClonePass::addLoopMetadata(BasicBlock *Latch, unsigned VF) { + // This function sets the loop metadata for the new loop inserted around + // the simd function body. This metadata includes disabling unrolling just + // in case for some reason that unrolling occurs in between this pass and + // the vectorizer. 
Also, the loop vectorization metadata is set to try + // and force vectorization at the specified VF of the simd function. + // + // Set disable unroll metadata on the conditional branch of the loop latch + // for the simd loop. The following is an example of what the loop latch + // and Metadata will look like. The !llvm.loop marks the beginning of the + // loop Metadata and is always placed on the terminator of the loop latch. + // (i.e., simd.loop.exit in this case). According to LLVM documentation, to + // properly set the loop Metadata, the 1st operand of !16 must be a self- + // reference to avoid some type of Metadata merging conflicts that have + // apparently arisen in the past. This is part of LLVM history that I do not + // know. Also, according to LLVM documentation, any Metadata nodes referring + // to themselves are marked as distinct. As such, all Metadata corresponding + // to a loop belongs to that loop alone and no sharing of Metadata can be + // done across different loops. + // + // simd.loop.exit: ; preds = %simd.loop, %if.else, %if.then + // %indvar = add nuw i32 %index, 1 + // %vl.cond = icmp ult i32 %indvar, 2 + // br i1 %vl.cond, label %simd.loop, label %simd.end.region, !llvm.loop !16 + // + // !16 = distinct !{!16, !17} + // !17 = !{!"llvm.loop.unroll.disable"} + + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + + // Add unroll(disable) metadata to disable future unrolling. + LLVMContext &Context = Latch->getContext(); + SmallVector DisableOps; + DisableOps.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); + DisableOps.push_back(MDString::get(Context, "llvm.loop.vectorize.enable")); + Metadata *Vals[] = { + MDString::get(Context, "llvm.loop.vectorize.width"), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), VF))}; + DisableOps.push_back(MDNode::get(Context, Vals)); + MDNode *DisableNode = MDNode::get(Context, DisableOps); + MDs.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + Latch->getTerminator()->setMetadata("llvm.loop", NewLoopID); +} + +void VecClonePass::widenAllocaInstructions( + Function *Clone, + DenseMap &AllocaMap, + BasicBlock &EntryBlock, + VectorVariant &Variant, + const DataLayout &DL) { + + DenseMap::iterator AllocaMapIt; + SmallVector StoresToRemove; + + for (auto &Arg : Clone->args()) { + SmallVector ArgUsers; + for (auto *U : Arg.users()) { + // Only update parameter users in the loop. + if (Instruction *Inst = dyn_cast(U)) + if (Inst->getParent() != &EntryBlock) + ArgUsers.push_back(U); + } + + Type *ArgTy = Arg.getType(); + VectorType *VecArgType = dyn_cast(ArgTy); + StringRef ArgName = Arg.getName(); + for (auto *U : ArgUsers) { + // For non-optimized parameters, i.e., for parameters that are loads and + // stores through memory (allocas), we need to know which alloca belongs + // to which parameter. This can be done by finding the store of the + // parameter to an alloca. Set up a map that maintains this relationship + // so that we can update the users of the original allocas with the new + // widened ones. When widening the allocas, vector parameters will be + // stored to a vector alloca, and linear/uniform parameters will be + // stored to an array, using the loop index as the "lane". Nothing else + // needs to be done for optimized parameters. Later, this map will be + // used to update all alloca users. 
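+      // Sketch with hypothetical names, for a vector parameter %b of type
+      // float stored to "%b.addr = alloca float": this creates
+      //   %vec.b      = alloca <VF x float>
+      //   store <VF x float> %b, <VF x float>* %vec.b
+      //   %vec.b.cast = bitcast <VF x float>* %vec.b to float*
+      // and maps %b.addr -> %vec.b.cast, while a linear/uniform parameter %k
+      // gets "%arr.k = alloca [VF x i32]" mapped from %k.addr instead.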
+ StoreInst *StoreUser = dyn_cast(U); + LoadInst *LoadUser = dyn_cast(U); + AllocaInst *Alloca = nullptr; + + if (LoadUser) + Alloca = dyn_cast(LoadUser->getPointerOperand()); + + if (StoreUser) + Alloca = dyn_cast(StoreUser->getPointerOperand()); + + if (StoreUser && Alloca) { + AllocaMapIt = AllocaMap.find(Alloca); + if (AllocaMapIt == AllocaMap.end()) { + if (VecArgType) { + AllocaInst *VecAlloca = new AllocaInst( + VecArgType, DL.getAllocaAddrSpace(), "vec." + ArgName, + EntryBlock.getTerminator()); + StoreInst *VecStore = new StoreInst(&Arg, VecAlloca); + VecStore->insertAfter(VecAlloca); + PointerType *ElemTypePtr = + PointerType::get(VecArgType->getElementType(), + VecAlloca->getType()->getAddressSpace()); + BitCastInst *VecAllocaCast = new BitCastInst( + VecAlloca, ElemTypePtr, VecAlloca->getName() + ".cast"); + VecAllocaCast->insertAfter(VecStore); + AllocaMap[Alloca] = VecAllocaCast; + StoresToRemove.push_back(StoreUser); + } else { + ArrayType *ArrType = ArrayType::get(ArgTy, Variant.getVlen()); + AllocaInst *ArrAlloca = new AllocaInst( + ArrType, DL.getAllocaAddrSpace(), "arr." + ArgName, + EntryBlock.getTerminator()); + AllocaMap[Alloca] = ArrAlloca; + } + } + } + } + } + + // Remove the store of the parameter to the original alloca. A new one + // was just created for the new alloca. + for (auto *Store : StoresToRemove) + Store->eraseFromParent(); +} + +void VecClonePass::updateAllocaUsers( + Function *Clone, + PHINode *Phi, + DenseMap &AllocaMap) { + + SmallVector AllocaUsers; + for (auto Pair : AllocaMap) { + AllocaInst *OldAlloca = Pair.first; + for (auto *U : OldAlloca->users()) { + if (isa(U)) + AllocaUsers.push_back(U); + } + } + + // Update all alloca users by doing an a -> &a[i] transformation. This + // involves inserting a gep just before each use of the alloca. The only + // exception is for vector stores to an alloca. These are moved to the + // entry block of the function just after the widened alloca. + //for (unsigned j = 0; j < AllocaUsers.size(); j++) { + for (auto *U : AllocaUsers) { + unsigned NumOps = U->getNumOperands(); + for (unsigned k = 0; k < NumOps; k++) { + if (AllocaInst *OldAlloca = dyn_cast(U->getOperand(k))) { + if (AllocaInst *NewAlloca = + dyn_cast(AllocaMap[OldAlloca])) { + // If this is an alloca for a linear/uniform parameter, then insert + // a gep for the load/store and use the loop index to reference the + // proper value for each "lane". 
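+          // e.g., with hypothetical names, a load through the old %k.addr
+          // becomes:
+          //   %arr.k.gep = getelementptr [VF x i32], [VF x i32]* %arr.k,
+          //                              i32 0, i32 %index
+          //   %0 = load i32, i32* %arr.k.gep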
+ SmallVector GepIndices; + Constant *Idx0 = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), 0); + GepIndices.push_back(Idx0); + GepIndices.push_back(Phi); + SequentialType *SeqTy = + cast(NewAlloca->getAllocatedType()); + GetElementPtrInst *AllocaGep = + GetElementPtrInst::Create(SeqTy, NewAlloca, GepIndices, + NewAlloca->getName() + ".gep"); + AllocaGep->insertBefore(cast(U)); + U->setOperand(k, AllocaGep); + } else if (BitCastInst *NewAllocaCast = + dyn_cast(AllocaMap[OldAlloca])) { + SmallVector GepIndices; + GepIndices.push_back(Phi); + GetElementPtrInst *AllocaCastGep = + GetElementPtrInst::Create(OldAlloca->getAllocatedType(), + NewAllocaCast, GepIndices, + NewAllocaCast->getName() + ".gep"); + AllocaCastGep->insertBefore(cast(U)); + U->setOperand(k, AllocaCastGep); + } else { + llvm_unreachable( + "Expected array alloca for linear/uniform parameters or a " + "cast of vector alloca for vector parameters"); + } + } + } + } +} + +void VecClonePass::updateParameterUsers(Function *Clone, VectorVariant &Variant, + BasicBlock &EntryBlock, PHINode *Phi, + const DataLayout &DL) { + + // Update non-alloca parameter users based on type of parameter. Any users of + // the parameters that are also users of an alloca will not be updated again + // here since this has already been done. + std::vector ParmKinds = Variant.getParameters(); + DenseMap VecParmCasts; + DenseMap::iterator VecParmCastsIt; + + for (auto &Arg : Clone->args()) { + SmallVector ArgUsers; + for (auto *U : Arg.users()) { + // Only update parameter users in the loop. + if (Instruction *Inst = dyn_cast(U)) + if (Inst->getParent() != &EntryBlock) + ArgUsers.push_back(U); + } + + Type *ArgTy = Arg.getType(); + unsigned ArgNo = Arg.getArgNo(); + StringRef ArgName = Arg.getName(); + VectorType *VecArgType = dyn_cast(ArgTy); + for (unsigned j = 0; j < ArgUsers.size(); j++) { + User *U = ArgUsers[j]; + if (ParmKinds[ArgNo].isVector()) { + VecParmCastsIt = VecParmCasts.find(&Arg); + if (VecParmCastsIt == VecParmCasts.end()) { + AllocaInst *VecAlloca = + new AllocaInst(VecArgType, DL.getAllocaAddrSpace(), + "vec." + ArgName, EntryBlock.getTerminator()); + StoreInst *VecStore = new StoreInst(&Arg, VecAlloca); + VecStore->insertAfter(VecAlloca); + PointerType *ElemTypePtr = + PointerType::get(VecArgType->getElementType(), + VecAlloca->getType()->getAddressSpace()); + BitCastInst *VecAllocaCast = new BitCastInst( + VecAlloca, ElemTypePtr, VecAlloca->getName() + ".cast"); + VecAllocaCast->insertAfter(VecStore); + VecParmCasts[&Arg] = VecAllocaCast; + } + GetElementPtrInst *VecAllocaCastGep = GetElementPtrInst::Create( + VecArgType->getElementType(), VecParmCasts[&Arg], Phi, + VecParmCasts[&Arg]->getName() + ".gep", cast(U)); + LoadInst *ArgElemLoad = + new LoadInst(VecAllocaCastGep, "vec." 
+ ArgName + ".elem"); + ArgElemLoad->insertAfter(VecAllocaCastGep); + unsigned NumOps = U->getNumOperands(); + for (unsigned Op = 0; Op < NumOps; Op++) { + if (U->getOperand(Op) == &Arg) + U->setOperand(Op, ArgElemLoad); + } + } else if (ParmKinds[ArgNo].isLinear()) { + int Stride = ParmKinds[ArgNo].getStride(); + Constant *StrideConst = + ConstantInt::get(Type::getInt32Ty(Clone->getContext()), Stride); + Instruction *Mul = + BinaryOperator::CreateMul(StrideConst, Phi, "stride.mul"); + Mul->insertBefore(cast(U)); + Value *UserOp = nullptr; + if (ArgTy->isPointerTy()) { + PointerType *ParmPtrType = dyn_cast(ArgTy); + GetElementPtrInst *LinearParmGep = GetElementPtrInst::Create( + ParmPtrType->getElementType(), &Arg, Mul, ArgName + ".gep"); + LinearParmGep->insertAfter(Mul); + UserOp = LinearParmGep; + } else { + if (Mul->getType() != ArgTy) { + CastInst *MulCast = CastInst::CreateSExtOrBitCast( + Mul, ArgTy, Mul->getName() + ".cast"); + MulCast->insertAfter(Mul); + Mul = MulCast; + } + BinaryOperator *Add = + BinaryOperator::CreateAdd(&Arg, Mul, "stride.add"); + Add->insertAfter(Mul); + UserOp = Add; + } + + unsigned NumOps = U->getNumOperands(); + for (unsigned Op = 0; Op < NumOps; Op++) { + if (U->getOperand(Op) == &Arg) + U->setOperand(Op, UserOp); + } + } + } + } +} + +bool VecClonePass::runImpl(Module &M, Function &F, VectorVariant &Variant) { + + DEBUG(dbgs() << "Before SIMD Function Cloning\n"); + DEBUG(F.dump()); + DEBUG(dbgs() << "Generating variant '" << + Variant.generateFunctionName(F.getName()) << "'\n\n"); + + // Clone the original function. + Function *Clone = CloneFunction(M, F, Variant); + if (!Clone) + return false; + + BasicBlock &EntryBlock = Clone->getEntryBlock(); + if (isSimpleFunction(Clone, EntryBlock)) + return false; + + // Remove any incompatible attributes that happen as part of widening + // function vector parameters. + removeIncompatibleAttributes(Clone); + + const DataLayout &DL = Clone->getParent()->getDataLayout(); + DenseMap AllocaMap; + // Split the entry block at the beginning and create a block for the + // loop entry. + BasicBlock *LoopBlock = EntryBlock.splitBasicBlock(EntryBlock.begin(), + "simd.loop"); + + // On the split, the alloca instructions are moved into LoopBlock. Move + // them back to the entry block. + SmallVector Allocas; + SmallVector VecStores; + BasicBlock::iterator BBIt = LoopBlock->begin(); + BasicBlock::iterator BBEnd = LoopBlock->end(); + for (; BBIt != BBEnd; ++BBIt) { + if (AllocaInst *Alloca = dyn_cast(&*BBIt)) + Allocas.push_back(Alloca); + } + for (auto *Alloca : Allocas) + Alloca->moveBefore(EntryBlock.getTerminator()); + + widenAllocaInstructions(Clone, AllocaMap, EntryBlock, Variant, DL); + + // Create a vector alloca for the return. The return type of the clone + // has already been widened, so the type can be used directly. + AllocaInst *VecRetAlloca = nullptr; + Type *VecRetTy = Clone->getReturnType(); + if (!VecRetTy->isVoidTy()) { + VecRetAlloca = new AllocaInst(VecRetTy, DL.getAllocaAddrSpace(), + "vec.ret", EntryBlock.getTerminator()); + } + + // Find the basic block containing the return. We need to know where + // to replace the return instruction with a store to the return vector + // and where to split off a loop exit block containing the loop exit + // condition. 
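+  // After the splits below, the clone's CFG looks roughly like:
+  //   entry -> simd.loop -> ... -> simd.loop.exit -> return
+  // with simd.loop.exit branching back to simd.loop until %index reaches VF.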
+ Function::iterator FuncIt = Clone->begin(); + Function::iterator FuncEnd = Clone->end(); + BasicBlock *ReturnBlock = nullptr; + Instruction *RetInst = nullptr; + unsigned NumRets = 1; + for (; FuncIt != FuncEnd; ++FuncIt) { + if (isa(FuncIt->getTerminator())) { + // TODO: Haven't yet found (or created) a test case where there are + // multiple ret instructions. Assert for now. + assert(NumRets == 1 && + "Unsupported function due to multiple return instructions"); + ReturnBlock = &*FuncIt; + RetInst = FuncIt->getTerminator(); + NumRets++; + } + } + + // Create a basic block that will contain the loop exit condition. + BasicBlock *LoopExitBlock = + ReturnBlock->splitBasicBlock(RetInst, "simd.loop.exit"); + + // Create a new return block that will contain the load of the return + // vector and the new return instruction. + BasicBlock *NewReturnBlock = + LoopExitBlock->splitBasicBlock(LoopExitBlock->getTerminator(), "return"); + + // Generate the phi for the loop index, the loop index increment, and + // loop exit condition and put these instructions in LoopExitBlock. + PHINode *Phi = generateLoopForFunctionBody(Clone, &EntryBlock, LoopBlock, + LoopExitBlock, NewReturnBlock, + Variant.getVlen()); + + // Generate the load from the return vector and new return instruction + // and put them in the new return basic block. + LoadInst *VecReturn = + new LoadInst(VecRetAlloca, "vec.ret", NewReturnBlock); + ReturnInst::Create(Clone->getContext(), VecReturn, NewReturnBlock); + + // Change the return instruction to a store to the return vector. + Value *StoreVal = RetInst->getOperand(0); + Type *StoreValTy = StoreVal->getType(); + PointerType *ElemTypePtr = + PointerType::get(StoreValTy, DL.getAllocaAddrSpace()); + BitCastInst *RetAllocaCast = new BitCastInst( + VecRetAlloca, ElemTypePtr, VecRetAlloca->getName() + ".cast"); + RetAllocaCast->insertAfter(VecRetAlloca); + GetElementPtrInst *RetAllocaCastGep = GetElementPtrInst::Create( + StoreValTy, RetAllocaCast, Phi, RetAllocaCast->getName() + ".gep"); + RetAllocaCastGep->insertBefore(ReturnBlock->getTerminator()); + StoreInst *RetStore = new StoreInst(RetInst->getOperand(0), RetAllocaCastGep); + RetStore->insertAfter(RetAllocaCastGep); + RetInst->eraseFromParent(); + + updateAllocaUsers(Clone, Phi, AllocaMap); + + updateParameterUsers(Clone, Variant, EntryBlock, Phi, DL); + + // For masked variants, create a vector mask parameter and insert the mask + // bit checks. + if (Variant.isMasked()) { + // Create a vector alloca for the mask parameter. + Function::arg_iterator MaskParam = Clone->arg_end(); + MaskParam--; + AllocaInst *MaskAlloca = new AllocaInst(MaskParam->getType(), + DL.getAllocaAddrSpace(), + "vec." + MaskParam->getName(), + EntryBlock.getTerminator()); + StoreInst *MaskStore = new StoreInst(MaskParam, MaskAlloca); + MaskStore->insertAfter(MaskAlloca); + VectorType *MaskTy = cast(MaskParam->getType()); + PointerType *ElemTypePtr = + PointerType::get(MaskTy->getElementType(), DL.getAllocaAddrSpace()); + BitCastInst *MaskAllocaCast = new BitCastInst(MaskAlloca, + ElemTypePtr,"mask.cast"); + MaskAllocaCast->insertAfter(MaskStore); + + insertSplitForMaskedVariant(Clone, LoopBlock, LoopExitBlock, + MaskAllocaCast, Phi); + } + + // Remove old allocas + for (auto Pair : AllocaMap) { + AllocaInst *OldAlloca = Pair.first; + OldAlloca->eraseFromParent(); + } + + // Prevent unrolling from kicking in before loop vectorization and force + // vectorization of the loop to the VF of the simd function. 
+  addLoopMetadata(LoopExitBlock, Variant.getVlen());
+
+  DEBUG(dbgs() << "After SIMD Function Cloning\n");
+  DEBUG(Clone->dump());
+
+  return true; // LLVM IR has been modified
+}
+
+bool VecClone::runOnModule(Module &M) {
+  bool Changed = false;
+  FunctionVariants FunctionsToVectorize;
+  Impl.getFunctionsToVectorize(M, FunctionsToVectorize);
+  for (auto Pair : FunctionsToVectorize) {
+    Function &F = *(Pair.first);
+    std::vector<StringRef> Variants = Pair.second;
+    TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    for (auto V : Variants) {
+      VectorVariant Variant(V, TTI);
+      Changed |= Impl.runImpl(M, F, Variant);
+    }
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses VecClonePass::run(Module &M,
+                                    ModuleAnalysisManager &AM) {
+  bool Changed = false;
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  FunctionVariants FunctionsToVectorize;
+  getFunctionsToVectorize(M, FunctionsToVectorize);
+  for (auto Pair : FunctionsToVectorize) {
+    Function &F = *(Pair.first);
+    std::vector<StringRef> Variants = Pair.second;
+    TargetTransformInfo *TTI = &FAM.getResult<TargetIRAnalysis>(F);
+    for (auto V : Variants) {
+      VectorVariant Variant(V, TTI);
+      Changed |= runImpl(M, F, Variant);
+    }
+  }
+
+  if (Changed)
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+void VecClone::print(raw_ostream &OS, const Module *M) const {
+  // TODO
+}
+
+ModulePass *llvm::createVecClonePass() { return new llvm::VecClone(); }
+
+char VecClone::ID = 0;
+
+static const char lv_name[] = "VecClone";
+INITIALIZE_PASS_BEGIN(VecClone, SV_NAME, lv_name, false /* modifies CFG */,
+                      false /* transform pass */)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(VecClone, SV_NAME, lv_name, false /* modifies CFG */,
+                    false /* transform pass */)
+
Index: test/Transforms/LoopVectorize/masked_simd_func.ll
===================================================================
--- test/Transforms/LoopVectorize/masked_simd_func.ll
+++ test/Transforms/LoopVectorize/masked_simd_func.ll
@@ -0,0 +1,107 @@
+; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone.
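+;
+; For reference, source along the following lines produces this caller (an
+; illustration only; the original C source is not part of the patch, and the
+; names are taken from the IR below):
+;
+;   #pragma omp declare simd uniform(c) linear(k)
+;   int dowork(int b, int k, int c) { return b + k + c; }
+;
+;   // a and b are int[4096] locals in main below
+;   int c = b[3];
+;   #pragma omp simd
+;   for (int k = 0; k < 4096; k++)
+;     if (k % 2)
+;       a[k] = dowork(b[k], k, c);
+;
+; Because the call is guarded by the if, the vectorizer needs a masked ('M')
+; variant, hence the reference to @_ZGVdM8vlu_dowork checked below.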
+ +; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s + +; CHECK: call <8 x i32> @_ZGVdM8vlu_dowork + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +; Function Attrs: noinline nounwind uwtable +define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 { +entry: + %add = add nsw i32 %b, %k + %add1 = add nsw i32 %add, %c + ret i32 %add1 +} + +; Function Attrs: noinline nounwind uwtable +define i32 @main() local_unnamed_addr #1 { +entry: + %a = alloca [4096 x i32], align 16 + %b = alloca [4096 x i32], align 16 + %0 = bitcast [4096 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #5 + %1 = bitcast [4096 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #5 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv39 = phi i64 [ 0, %entry ], [ %indvars.iv.next40, %for.body ] + %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv39 + %2 = trunc i64 %indvars.iv39 to i32 + store i32 %2, i32* %arrayidx, align 4, !tbaa !2 + %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1 + %exitcond41 = icmp eq i64 %indvars.iv.next40, 4096 + br i1 %exitcond41, label %for.end, label %for.body + +for.end: ; preds = %for.body + %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3 + %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.inc, %for.end + %indvars.iv36 = phi i64 [ 0, %for.end ], [ %indvars.iv.next37, %omp.inner.for.inc ] + %4 = trunc i64 %indvars.iv36 to i32 + %rem = and i32 %4, 1 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %omp.inner.for.inc, label %if.then + +if.then: ; preds = %omp.inner.for.body + %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv36 + %5 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + %call = tail call i32 @dowork(i32 %5, i32 %4, i32 %3), !llvm.mem.parallel_loop_access !6 + %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv36 + store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body, %if.then + %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1 + %exitcond38 = icmp eq i64 %indvars.iv.next37, 4096 + br i1 %exitcond38, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6 + +omp.inner.for.end: ; preds = %omp.inner.for.inc + br label %for.body11 + +for.body11: ; preds = %for.body11, %omp.inner.for.end + %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ] + %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv + %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2 + %call14 = tail call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond, label %for.end17, label %for.body11 + +for.end17: ; preds = %for.body11 + call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #5 + ret i32 0 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 + +declare i32 @printf(i8*, ...) #3 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN16vlu_dowork,_ZGVbM4vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM16vlu_dowork" } +attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0 (trunk 316400)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.vectorize.enable", i1 true} Index: test/Transforms/LoopVectorize/simd_func.ll =================================================================== --- test/Transforms/LoopVectorize/simd_func.ll +++ test/Transforms/LoopVectorize/simd_func.ll @@ -0,0 +1,99 @@ +; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone. 
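+;
+; This is the unmasked counterpart of masked_simd_func.ll: the call to dowork
+; is not guarded by a condition inside the simd loop, so a non-masked ('N')
+; variant can be used, hence the reference to @_ZGVdN8vlu_dowork checked below.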
+ +; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s + +; CHECK: call <8 x i32> @_ZGVdN8vlu_dowork + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +; Function Attrs: noinline nounwind uwtable +define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 { +entry: + %add = add nsw i32 %b, %k + %add1 = add nsw i32 %add, %c + ret i32 %add1 +} + +; Function Attrs: noinline nounwind uwtable +define i32 @main() local_unnamed_addr #1 { +entry: + %a = alloca [4096 x i32], align 16 + %b = alloca [4096 x i32], align 16 + %0 = bitcast [4096 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #5 + %1 = bitcast [4096 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #5 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv38 = phi i64 [ 0, %entry ], [ %indvars.iv.next39, %for.body ] + %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv38 + %2 = trunc i64 %indvars.iv38 to i32 + store i32 %2, i32* %arrayidx, align 4, !tbaa !2 + %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1 + %exitcond40 = icmp eq i64 %indvars.iv.next39, 4096 + br i1 %exitcond40, label %for.end, label %for.body + +for.end: ; preds = %for.body + %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3 + %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body, %for.end + %indvars.iv35 = phi i64 [ 0, %for.end ], [ %indvars.iv.next36, %omp.inner.for.body ] + %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv35 + %4 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + %5 = trunc i64 %indvars.iv35 to i32 + %call = tail call i32 @dowork(i32 %4, i32 %5, i32 %3), !llvm.mem.parallel_loop_access !6 + %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv35 + store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond37 = icmp eq i64 %indvars.iv.next36, 4096 + br i1 %exitcond37, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6 + +omp.inner.for.end: ; preds = %omp.inner.for.body + br label %for.body11 + +for.body11: ; preds = %for.body11, %omp.inner.for.end + %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ] + %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv + %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2 + %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond, label %for.end17, label %for.body11 + +for.end17: ; preds = %for.body11 + call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #5 + ret i32 0 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 + +declare i32 @printf(i8*, ...) 
#3 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN16vlu_dowork,_ZGVbM4vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM16vlu_dowork" } +attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0 (trunk 316400)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.vectorize.enable", i1 true} Index: test/Transforms/LoopVectorize/simd_func_scalar.ll =================================================================== --- test/Transforms/LoopVectorize/simd_func_scalar.ll +++ test/Transforms/LoopVectorize/simd_func_scalar.ll @@ -0,0 +1,111 @@ +; Note: Test the simd function caller side functionality. The function side vectorization is tested under VecClone. 
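+;
+; All variants advertised on @dowork below are 8-lane (..N8vlu../..M8vlu..),
+; while the loop metadata in this test forces a vectorization width of 4, so
+; presumably no declared variant matches the chosen VF and the call stays
+; scalar: each vector iteration extracts the lane operands and issues four
+; scalar calls to @dowork, which is what the CHECK lines verify.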
+ +; RUN: opt < %s -vec-clone -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s + +; CHECK: extractelement <4 x i32> +; CHECK: extractelement <4 x i32> +; CHECK: call i32 @dowork +; CHECK: extractelement <4 x i32> +; CHECK: extractelement <4 x i32> +; CHECK: call i32 @dowork +; CHECK: extractelement <4 x i32> +; CHECK: extractelement <4 x i32> +; CHECK: call i32 @dowork +; CHECK: extractelement <4 x i32> +; CHECK: extractelement <4 x i32> +; CHECK: call i32 @dowork + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +; Function Attrs: noinline nounwind uwtable +define i32 @dowork(i32 %b, i32 %k, i32 %c) #0 { +entry: + %add = add nsw i32 %b, %k + %add1 = add nsw i32 %add, %c + ret i32 %add1 +} + +; Function Attrs: noinline nounwind uwtable +define i32 @main() local_unnamed_addr #1 { +entry: + %a = alloca [4096 x i32], align 16 + %b = alloca [4096 x i32], align 16 + %0 = bitcast [4096 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %0) #5 + %1 = bitcast [4096 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull %1) #5 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv38 = phi i64 [ 0, %entry ], [ %indvars.iv.next39, %for.body ] + %arrayidx = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv38 + %2 = trunc i64 %indvars.iv38 to i32 + store i32 %2, i32* %arrayidx, align 4, !tbaa !2 + %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1 + %exitcond40 = icmp eq i64 %indvars.iv.next39, 4096 + br i1 %exitcond40, label %for.end, label %for.body + +for.end: ; preds = %for.body + %arrayidx1 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 3 + %3 = load i32, i32* %arrayidx1, align 4, !tbaa !2 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body, %for.end + %indvars.iv35 = phi i64 [ 0, %for.end ], [ %indvars.iv.next36, %omp.inner.for.body ] + %arrayidx5 = getelementptr inbounds [4096 x i32], [4096 x i32]* %b, i64 0, i64 %indvars.iv35 + %4 = load i32, i32* %arrayidx5, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + %5 = trunc i64 %indvars.iv35 to i32 + %call = tail call i32 @dowork(i32 %4, i32 %5, i32 %3), !llvm.mem.parallel_loop_access !6 + %arrayidx7 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv35 + store i32 %call, i32* %arrayidx7, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond37 = icmp eq i64 %indvars.iv.next36, 4096 + br i1 %exitcond37, label %omp.inner.for.end, label %omp.inner.for.body, !llvm.loop !6 + +omp.inner.for.end: ; preds = %omp.inner.for.body + br label %for.body11 + +for.body11: ; preds = %for.body11, %omp.inner.for.end + %indvars.iv = phi i64 [ 0, %omp.inner.for.end ], [ %indvars.iv.next, %for.body11 ] + %arrayidx13 = getelementptr inbounds [4096 x i32], [4096 x i32]* %a, i64 0, i64 %indvars.iv + %6 = load i32, i32* %arrayidx13, align 4, !tbaa !2 + %call14 = tail call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %6) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond, label %for.end17, label %for.body11 + +for.end17: ; preds = %for.body11 + call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull %0) #5 + ret i32 0 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 + +declare i32 @printf(i8*, ...) #3 + +attributes #0 = { noinline norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN8vlu_dowork,_ZGVcN8vlu_dowork,_ZGVdN8vlu_dowork,_ZGVeN8vlu_dowork,_ZGVbM8vlu_dowork,_ZGVcM8vlu_dowork,_ZGVdM8vlu_dowork,_ZGVeM8vlu_dowork" } +attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0 (trunk 316400)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 4} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} Index: test/Transforms/VecClone/all_parm_types.ll =================================================================== --- test/Transforms/VecClone/all_parm_types.ll +++ test/Transforms/VecClone/all_parm_types.ll @@ -0,0 +1,46 @@ +; Test all different kinds of parameters (uniform, linear, vector), multiple uses of linear k, and that stride calculations can handle type conversions. 
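+;
+; A sketch of the corresponding source (illustration only; the parameter kinds
+; follow the 'uvl' mangling: a uniform, b vector, k linear):
+;
+;   #pragma omp declare simd uniform(a) linear(k)
+;   float dowork(float *a, float b, long k) {
+;     return sinf(a[k]) + b + k;
+;   }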
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4uvl_dowork +; CHECK: simd.loop: +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %stride.cast{{.*}} = sext i32 %stride.mul{{.*}} +; CHECK: %stride.add{{.*}} = add i64 %k, %stride.cast{{.*}} +; CHECK: %arrayidx = getelementptr inbounds float, float* %a, i64 %stride.add{{.*}} +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %stride.cast{{.*}} = bitcast i32 %stride.mul{{.*}} to float +; CHECK: %stride.add{{.*}} = fadd float %conv, %stride.cast{{.*}} +; CHECK: %add{{.*}} = fadd float %add, %stride.add{{.*}} + +; ModuleID = 'rfc.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define float @dowork(float* %a, float %b, i64 %k) #0 { +entry: + %arrayidx = getelementptr inbounds float, float* %a, i64 %k + %0 = load float, float* %arrayidx, align 4, !tbaa !2 + %call = call float @sinf(float %0) #5 + %add = fadd float %call, %b + %conv = sitofp i64 %k to float + %add1 = fadd float %add, %conv + ret float %add1 +} + +; Function Attrs: nounwind +declare float @sinf(float) #1 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN4uvl_dowork,_ZGVcN8uvl_dowork,_ZGVdN8uvl_dowork,_ZGVeN16uvl_dowork,_ZGVbM4uvl_dowork,_ZGVcM8uvl_dowork,_ZGVdM8uvl_dowork,_ZGVeM16uvl_dowork" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0 (trunk 316400)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"float", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} Index: test/Transforms/VecClone/broadcast.ll =================================================================== --- test/Transforms/VecClone/broadcast.ll +++ test/Transforms/VecClone/broadcast.ll @@ -0,0 +1,19 @@ +; Check broadcast of a constant. The store of the constant should be moved inside of the loop. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4_foo +; CHECK: simd.loop: +; CHECK: store i32 99, i32* %ret.cast.gep + +; ModuleID = 'foo.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo() #0 { +entry: + ret i32 99 +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4_foo,_ZGVbN4_foo,_ZGVcM8_foo,_ZGVcN8_foo,_ZGVdM8_foo,_ZGVdN8_foo,_ZGVeM16_foo,_ZGVeN16_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/convert_linear.ll =================================================================== --- test/Transforms/VecClone/convert_linear.ll +++ test/Transforms/VecClone/convert_linear.ll @@ -0,0 +1,32 @@ +; Check handling of upconverting a linear (variable %i) to ensure stride calculation +; is inserted correctly and the old convert (sext) uses the stride instead of the old +; reference to %i. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN2vl_foo +; CHECK: simd.loop: +; CHECK: %0 = load i32, i32* %i.addr +; CHECK-NEXT: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %0, %stride.mul +; CHECK-NEXT: %conv = sext i32 %stride.add to i64 + +; ModuleID = 'convert.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i64 @foo(i64 %x, i32 %i) #0 { +entry: + %x.addr = alloca i64, align 8 + %i.addr = alloca i32, align 4 + store i64 %x, i64* %x.addr, align 8 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = load i64, i64* %x.addr, align 8 + %add = add nsw i64 %conv, %1 + ret i64 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM2vl_foo,_ZGVbN2vl_foo,_ZGVcM4vl_foo,_ZGVcN4vl_foo,_ZGVdM4vl_foo,_ZGVdN4vl_foo,_ZGVeM8vl_foo,_ZGVeN8vl_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/external_array.ll =================================================================== --- test/Transforms/VecClone/external_array.ll +++ test/Transforms/VecClone/external_array.ll @@ -0,0 +1,35 @@ +; Check to see that we are applying the correct updated linear index for an external array access gep. 
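+;
+; Roughly the following source (illustration only; 'ul' mangling: x uniform,
+; i linear):
+;
+;   int ext_a[128];
+;   #pragma omp declare simd uniform(x) linear(i)
+;   void foo(int x, int i) { ext_a[i] = x; }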
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4ul_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %idxprom = sext i32 %stride.add to i64 +; CHECK: %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom +; CHECK: store i32 %0, i32* %arrayidx + +; ModuleID = 'external_array_assign.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@ext_a = common global [128 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @foo(i32 %x, i32 %i) #0 { +entry: + %x.addr = alloca i32, align 4 + %i.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @ext_a, i64 0, i64 %idxprom + store i32 %0, i32* %arrayidx, align 4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable "vector-variants"="_ZGVbM4ul_foo,_ZGVbN4ul_foo,_ZGVcM8ul_foo,_ZGVcN8ul_foo,_ZGVdM8ul_foo,_ZGVdN8ul_foo,_ZGVeM16ul_foo,_ZGVeN16ul_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear.ll =================================================================== --- test/Transforms/VecClone/linear.ll +++ test/Transforms/VecClone/linear.ll @@ -0,0 +1,29 @@ +; Check to see that the linear parameter i is updated with the correct stride, indicated by a mul/add instruction sequence after the load. 
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %1 = load i32, i32* %i.addr +; CHECK: %stride.mul = mul i32 1, %index +; CHECK: %stride.add = add i32 %1, %stride.mul +; CHECK: %add = add nsw i32 %0, %stride.add + +; ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %i.addr = alloca i32, align 4 + %x.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %x, i32* %x.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %i.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4lu_foo,_ZGVbN4lu_foo,_ZGVcM8lu_foo,_ZGVcN8lu_foo,_ZGVdM8lu_foo,_ZGVdN8lu_foo,_ZGVeM16lu_foo,_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/linear_mem2reg.ll =================================================================== --- test/Transforms/VecClone/linear_mem2reg.ll +++ test/Transforms/VecClone/linear_mem2reg.ll @@ -0,0 +1,22 @@ +; Check to see that the linear parameter i is updated with the correct stride when Mem2Reg is on. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4lu_foo +; CHECK: simd.loop: +; CHECK: %stride.mul = mul i32 1, %index +; CHECK-NEXT: %stride.add = add i32 %i, %stride.mul +; CHECK-NEXT: %add = add nsw i32 %x, %stride.add + +;ModuleID = 'linear.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %i, i32 %x) #0 { +entry: + %add = add nsw i32 %x, %i + ret i32 %add +} + +attributes #0 = { norecurse nounwind readnone uwtable "vector-variants"="_ZGVbM4lu_foo,_ZGVbN4lu_foo,_ZGVcM8lu_foo,_ZGVcN8lu_foo,_ZGVdM8lu_foo,_ZGVdN8lu_foo,_ZGVeM16lu_foo,_ZGVeN16lu_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/struct_linear_ptr.ll =================================================================== --- test/Transforms/VecClone/struct_linear_ptr.ll +++ test/Transforms/VecClone/struct_linear_ptr.ll @@ -0,0 +1,40 @@ +; Test that the stride is being applied correctly to struct field accesses. 
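+;
+; Roughly (illustration only; the C field names are guesses based on the gep
+; names in the IR):
+;
+;   struct my_struct { float field1; char f2; int f3; short f4; float field5; long f6; };
+;   #pragma omp declare simd linear(s)
+;   float foo(struct my_struct *s) { return s->field1 + s->field5; }
+;
+; With linear(s) each lane advances s by one element, so the stride gep on the
+; struct pointer is emitted before the individual field geps, as checked below.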
+ +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: @_ZGVbN4l_foo +; CHECK: simd.loop: +; CHECK: %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %0, i32 %stride.mul{{.*}} +; CHECK: %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 0 +; CHECK: %1 = load float, float* %field1, align 8 +; CHECK: %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 +; CHECK: %stride.mul{{.*}} = mul i32 1, %index +; CHECK: %s.addr.gep{{.*}} = getelementptr %struct.my_struct, %struct.my_struct* %2, i32 %stride.mul{{.*}} +; CHECK: %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %s.addr.gep{{.*}}, i32 0, i32 4 +; CHECK: %3 = load float, float* %field5, align 8 +; CHECK: %add = fadd float %1, %3 + +; ModuleID = 'struct_linear_ptr.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.my_struct = type { float, i8, i32, i16, float, i64 } + +; Function Attrs: nounwind uwtable +define float @foo(%struct.my_struct* %s) #0 { +entry: + %s.addr = alloca %struct.my_struct*, align 8 + store %struct.my_struct* %s, %struct.my_struct** %s.addr, align 8 + %0 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field1 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %0, i32 0, i32 0 + %1 = load float, float* %field1, align 8 + %2 = load %struct.my_struct*, %struct.my_struct** %s.addr, align 8 + %field5 = getelementptr inbounds %struct.my_struct, %struct.my_struct* %2, i32 0, i32 4 + %3 = load float, float* %field5, align 8 + %add = fadd float %1, %3 + ret float %add +} + +attributes #0 = { norecurse nounwind readonly uwtable "vector-variants"="_ZGVbM4l_foo,_ZGVbN4l_foo,_ZGVcM8l_foo,_ZGVcN8l_foo,_ZGVdM8l_foo,_ZGVdN8l_foo,_ZGVeM16l_foo,_ZGVeN16l_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum.ll +++ test/Transforms/VecClone/two_vec_sum.ll @@ -0,0 +1,59 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the non-masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
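+;
+; The scalar source is essentially (illustration only):
+;
+;   #pragma omp declare simd
+;   int vec_sum(int i, int j) { return i + j; }
+;
+; With no uniform/linear clauses both parameters are vector ('vv'), and the
+; 4-lane 'b' (SSE/xmm) variant checked below is _ZGVbN4vv_vec_sum.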
+ +; CHECK-LABEL: <4 x i32> @_ZGVbN4vv_vec_sum(<4 x i32> %i, <4 x i32> %j) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mask.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mask.ll +++ test/Transforms/VecClone/two_vec_sum_mask.ll @@ -0,0 +1,71 @@ +; Do a sanity check on the structure of the LLVM that VecClone produces for the masked variant. + +; RUN: opt -vec-clone -S < %s | FileCheck %s +; NOTE: This test checks order very strictly and can change depending on optimization level used. +; FYI, the IR here was generated using -O0 in the event an issue needs to be reproduced. 
+ +; Begin non-masked variant checking + +; CHECK-LABEL: <4 x i32> @_ZGVbM4vv_vec_sum(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask) +; CHECK-NEXT: entry: +; CHECK-NEXT: %vec.i = alloca <4 x i32> +; CHECK-NEXT: %vec.j = alloca <4 x i32> +; CHECK-NEXT: %vec.mask = alloca <4 x i32> +; CHECK-NEXT: %vec.retval = alloca <4 x i32> +; CHECK-NEXT: store <4 x i32> %i, <4 x i32>* %vec.i, align 4 +; CHECK-NEXT: store <4 x i32> %j, <4 x i32>* %vec.j, align 4 +; CHECK-NEXT: store <4 x i32> %mask, <4 x i32>* %vec.mask +; CHECK-NEXT: %vec.i.cast = bitcast <4 x i32>* %vec.i to i32* +; CHECK-NEXT: %vec.j.cast = bitcast <4 x i32>* %vec.j to i32* +; CHECK-NEXT: %ret.cast = bitcast <4 x i32>* %vec.retval to i32* +; CHECK-NEXT: %mask.cast = bitcast <4 x i32>* %vec.mask to i32* +; CHECK-NEXT: br label %simd.loop + +; CHECK: simd.loop: +; CHECK-NEXT: %index = phi i32 [ 0, %entry ], [ %indvar, %simd.loop.exit ] +; CHECK-NEXT: %mask.gep = getelementptr i32, i32* %mask.cast, i32 %index +; CHECK-NEXT: %mask.parm = load i32, i32* %mask.gep +; CHECK-NEXT: %mask.cond = icmp ne i32 %mask.parm, 0 +; CHECK-NEXT: br i1 %mask.cond, label %simd.loop.then, label %simd.loop.else + +; CHECK: simd.loop.then: +; CHECK-NEXT: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK-NEXT: %0 = load i32, i32* %vec.i.cast.gep, align 4 +; CHECK-NEXT: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK-NEXT: %1 = load i32, i32* %vec.j.cast.gep, align 4 +; CHECK-NEXT: %add = add nsw i32 %0, %1 +; CHECK-NEXT: %ret.cast.gep = getelementptr i32, i32* %ret.cast, i32 %index +; CHECK-NEXT: store i32 %add, i32* %ret.cast.gep +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.else: +; CHECK-NEXT: br label %simd.loop.exit + +; CHECK: simd.loop.exit: +; CHECK-NEXT: %indvar = add nuw i32 %index, 1 +; CHECK-NEXT: %vl.cond = icmp ult i32 %indvar, 4 +; CHECK-NEXT: br i1 %vl.cond, label %simd.loop, label %return + +; CHECK: return: +; CHECK-NEXT: %vec.ret.cast = bitcast i32* %ret.cast to <4 x i32>* +; CHECK-NEXT: %vec.ret = load <4 x i32>, <4 x i32>* %vec.ret.cast +; CHECK-NEXT: ret <4 x i32> %vec.ret + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %1 = load i32, i32* %j.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/two_vec_sum_mem2reg.ll =================================================================== --- test/Transforms/VecClone/two_vec_sum_mem2reg.ll +++ test/Transforms/VecClone/two_vec_sum_mem2reg.ll @@ -0,0 +1,31 @@ +; Check to be sure that when Mem2Reg is on that all updates to instructions referring to the original 
+; parameter are updated correctly. When Mem2Reg is on, instructions will refer to the parameters +; directly and not through a load, which is why this is tested separately. + +; Note: the LLVM IR used as input to this test has already had Mem2Reg applied to it, so no need to +; do that here. This happens at higher optimization levels such as -O2. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; Begin non-masked variant checking + +; CHECK-LABEL: @_ZGVbN4vv_vec_sum +; CHECK: simd.loop: +; CHECK: %vec.i.cast.gep = getelementptr i32, i32* %vec.i.cast, i32 %index +; CHECK: %vec.i.elem = load i32, i32* %vec.i.cast.gep +; CHECK: %vec.j.cast.gep = getelementptr i32, i32* %vec.j.cast, i32 %index +; CHECK: %vec.j.elem = load i32, i32* %vec.j.cast.gep +; CHECK: %add = add nsw i32 %vec.i.elem, %vec.j.elem + +; ModuleID = 'two_vec_sum.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @vec_sum(i32 %i, i32 %j) #0 { +entry: + %add = add nsw i32 %i, %j + ret i32 %add +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4vv_vec_sum,_ZGVbN4vv_vec_sum,_ZGVcM8vv_vec_sum,_ZGVcN8vv_vec_sum,_ZGVdM8vv_vec_sum,_ZGVdN8vv_vec_sum,_ZGVeM16vv_vec_sum,_ZGVeN16vv_vec_sum" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/uniform.ll =================================================================== --- test/Transforms/VecClone/uniform.ll +++ test/Transforms/VecClone/uniform.ll @@ -0,0 +1,25 @@ +; Check to make sure the initial parameter store of the uniform parameter is sunk into the loop. + +; RUN: opt -vec-clone -S < %s | FileCheck %s + +; CHECK-LABEL: <4 x i32> @_ZGVbN4u_foo(i32 %b) +; CHECK: simd.loop: +; CHECK: store i32 %b + +; ModuleID = 'uniform.c' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %b) #0 { +entry: + %b.addr = alloca i32, align 4 + store i32 %b, i32* %b.addr, align 4 + %0 = load i32, i32* %b.addr, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* %b.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + ret i32 %1 +} + +attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4u_foo,_ZGVbN4u_foo,_ZGVcM8u_foo,_ZGVcN8u_foo,_ZGVdM8u_foo,_ZGVdN8u_foo,_ZGVeM16u_foo,_ZGVeN16u_foo" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/VecClone/vector_ptr.ll =================================================================== --- test/Transforms/VecClone/vector_ptr.ll +++ test/Transforms/VecClone/vector_ptr.ll @@ -0,0 +1,25 @@ +; Test that vector of pointers are handled with correctly in loop and that incompatible function return/arg attributes are removed. 
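+;
+; The scalar declaration below carries nonnull/readnone return and parameter
+; attributes, which no longer apply cleanly once the float* values are widened
+; to vectors of pointers; stripping them is what removeIncompatibleAttributes()
+; in the pass is for.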
+
+; RUN: opt -vec-clone -S < %s | FileCheck %s
+
+; CHECK-LABEL: @_ZGVbN2v_dowork
+; CHECK: simd.loop:
+; CHECK: %vec.p.cast.gep = getelementptr float*, float** %vec.p.cast, i32 %index
+; CHECK: %vec.p.elem = load float*, float** %vec.p.cast.gep
+; CHECK: %add.ptr = getelementptr inbounds float, float* %vec.p.elem, i64 1
+; CHECK: %ret.cast.gep = getelementptr float*, float** %ret.cast, i32 %index
+; CHECK: store float* %add.ptr, float** %ret.cast.gep
+
+source_filename = "vector_ptr.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define nonnull float* @dowork(float* readnone %p) local_unnamed_addr #0 {
+entry:
+  %add.ptr = getelementptr inbounds float, float* %p, i64 1
+  ret float* %add.ptr
+}
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "vector-variants"="_ZGVbN2v_dowork,_ZGVcN4v_dowork,_ZGVdN4v_dowork,_ZGVeN8v_dowork,_ZGVbM2v_dowork,_ZGVcM4v_dowork,_ZGVdM4v_dowork,_ZGVeM8v_dowork" }
Index: test/Transforms/VecClone/void_foo.ll
===================================================================
--- test/Transforms/VecClone/void_foo.ll
+++ test/Transforms/VecClone/void_foo.ll
@@ -0,0 +1,19 @@
+; Check to make sure we can handle a void foo() function.
+
+; RUN: opt -vec-clone -S < %s | FileCheck %s
+
+; CHECK-LABEL: void @_ZGVbN4_foo()
+; CHECK: entry:
+; CHECK: ret void
+
+; ModuleID = 'foo.c'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define void @foo() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "vector-variants"="_ZGVbM4_foo1,_ZGVbN4_foo1,_ZGVcM8_foo1,_ZGVcN8_foo1,_ZGVdM8_foo1,_ZGVdN8_foo1,_ZGVeM16_foo1,_ZGVeN16_foo1" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: tools/bugpoint/bugpoint.cpp
===================================================================
--- tools/bugpoint/bugpoint.cpp
+++ tools/bugpoint/bugpoint.cpp
@@ -134,6 +134,7 @@
   initializeInstCombine(Registry);
   initializeInstrumentation(Registry);
   initializeTarget(Registry);
+  initializeVecClonePass(Registry);
 #ifdef LINK_POLLY_INTO_TOOLS
   polly::initializePollyPasses(Registry);
Index: tools/opt/opt.cpp
===================================================================
--- tools/opt/opt.cpp
+++ tools/opt/opt.cpp
@@ -389,6 +389,7 @@
   initializeInstCombine(Registry);
   initializeInstrumentation(Registry);
   initializeTarget(Registry);
+  initializeVecClonePass(Registry);
   // For codegen passes, only passes that do IR to IR transformation are
   // supported.
   initializeScalarizeMaskedMemIntrinPass(Registry);