Index: include/llvm-c/Transforms/Scalar.h =================================================================== --- include/llvm-c/Transforms/Scalar.h +++ include/llvm-c/Transforms/Scalar.h @@ -41,6 +41,9 @@ /** See llvm::createDeadStoreEliminationPass function. */ void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM); +/** See llvm::createScalarizerPass function. */ +void LLVMAddScalarizerPass(LLVMPassManagerRef PM); + /** See llvm::createGVNPass function. */ void LLVMAddGVNPass(LLVMPassManagerRef PM); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -120,6 +120,7 @@ void initializeMemorySanitizerPass(PassRegistry&); void initializeThreadSanitizerPass(PassRegistry&); void initializeDataFlowSanitizerPass(PassRegistry&); +void initializeScalarizerPass(PassRegistry&); void initializeEarlyCSEPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); void initializeFindUsedTypesPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -153,6 +153,7 @@ (void) llvm::createSLPVectorizerPass(); (void) llvm::createBBVectorizePass(); (void) llvm::createPartiallyInlineLibCallsPass(); + (void) llvm::createScalarizerPass(); (void)new llvm::IntervalPartition(); (void)new llvm::FindUsedTypes(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -364,6 +364,12 @@ FunctionPass *createSampleProfileLoaderPass(); FunctionPass *createSampleProfileLoaderPass(StringRef Name); +//===----------------------------------------------------------------------===// +// +// ScalarizerPass - Converts vector operations into scalar operations +// +FunctionPass *createScalarizerPass(); + 
} // End llvm namespace #endif Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -5,6 +5,7 @@ CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp + Scalarizer.cpp EarlyCSE.cpp GlobalMerge.cpp GVN.cpp Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -34,6 +34,7 @@ initializeCorrelatedValuePropagationPass(Registry); initializeDCEPass(Registry); initializeDeadInstEliminationPass(Registry); + initializeScalarizerPass(Registry); initializeDSEPass(Registry); initializeGVNPass(Registry); initializeEarlyCSEPass(Registry); @@ -80,6 +81,10 @@ unwrap(PM)->add(createDeadStoreEliminationPass()); } +void LLVMAddScalarizerPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createScalarizerPass()); +} + void LLVMAddGVNPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createGVNPass()); } Index: lib/Transforms/Scalar/Scalarizer.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/Scalarizer.cpp @@ -0,0 +1,637 @@ +//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts vector operations into scalar operations, in order +// to expose optimization opportunities on the individual scalar operations. +// It is mainly intended for targets that do not have vector units, but it +// may also be useful for revectorizing code to different vector widths. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "scalarizer" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { +// Used to store the scattered form of a vector. +typedef SmallVector ValueVector; + +// Used to map a vector Value to its scattered form. We use std::map +// because we want iterators to persist across insertion and because the +// values are relatively large. +typedef std::map ScatterMap; + +// Lists Instructions that have been replaced with scalar implementations, +// along with a pointer to their scattered forms. +typedef SmallVector, 16> GatherList; + +// Provides a very limited vector-like interface for lazily accessing one +// component of a scattered vector or vector pointer. +class Scatterer { +public: + // Scatter V into Size components. If new instructions are needed, + // insert them before BBI in BB. If Cache is nonnull, use it to cache + // the results. + Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, + ValueVector *cachePtr = 0); + + // Return component I, creating a new Value for it if necessary. + Value *operator[](unsigned I); + + // Return the number of components. + unsigned size() const { return Size; } + +private: + BasicBlock *BB; + BasicBlock::iterator BBI; + Value *V; + ValueVector *CachePtr; + PointerType *PtrTy; + ValueVector Tmp; + unsigned Size; +}; + +// FCmpSpliiter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp +// called Name that compares X and Y in the same way as FCI. 
+struct FCmpSplitter { + FCmpSplitter(FCmpInst &fci) : FCI(fci) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, + const Twine &Name) const { + return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name); + } + FCmpInst &FCI; +}; + +// ICmpSpliiter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp +// called Name that compares X and Y in the same way as ICI. +struct ICmpSplitter { + ICmpSplitter(ICmpInst &ici) : ICI(ici) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, + const Twine &Name) const { + return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name); + } + ICmpInst &ICI; +}; + +// BinarySpliiter(BO)(Builder, X, Y, Name) uses Builder to create +// a binary operator like BO called Name with operands X and Y. +struct BinarySplitter { + BinarySplitter(BinaryOperator &bo) : BO(bo) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, + const Twine &Name) const { + return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name); + } + BinaryOperator &BO; +}; + +// GEPSpliiter()(Builder, X, Y, Name) uses Builder to create +// a single GEP called Name with operands X and Y. +struct GEPSplitter { + GEPSplitter() {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, + const Twine &Name) const { + return Builder.CreateGEP(Op0, Op1, Name); + } +}; + +// Information about a load or store that we're scalarizing. +struct VectorLayout { + VectorLayout() : VecTy(0), ElemTy(0), VecAlign(0), ElemSize(0) {} + + // Return the alignment of element I. + uint64_t getElemAlign(unsigned I) { + return MinAlign(VecAlign, I * ElemSize); + } + + // The type of the vector. + VectorType *VecTy; + + // The type of each element. + Type *ElemTy; + + // The alignment of the vector. + uint64_t VecAlign; + + // The size of each element. 
+ uint64_t ElemSize; +}; + +class Scalarizer : public FunctionPass, + public InstVisitor { +public: + static char ID; + + Scalarizer() : + FunctionPass(ID) { + initializeScalarizerPass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + // InstVisitor methods. They return true if the instruction was scalarized, + // false if nothing changed. + bool visitInstruction(Instruction &) { return false; } + bool visitSelectInst(SelectInst &SI); + bool visitICmpInst(ICmpInst &); + bool visitFCmpInst(FCmpInst &); + bool visitBinaryOperator(BinaryOperator &); + bool visitGetElementPtrInst(GetElementPtrInst &); + bool visitCastInst(CastInst &); + bool visitBitCastInst(BitCastInst &); + bool visitShuffleVectorInst(ShuffleVectorInst &); + bool visitPHINode(PHINode &); + bool visitLoadInst(LoadInst &); + bool visitStoreInst(StoreInst &); + +private: + Scatterer scatter(Instruction *, Value *); + void gather(Instruction *, const ValueVector &); + bool canTransferMetadata(unsigned Kind); + void transferMetadata(Instruction *, const ValueVector &); + bool getVectorLayout(Type *, unsigned, VectorLayout &); + bool finish(); + + template bool splitBinary(Instruction &, const T &); + + ScatterMap Scattered; + GatherList Gathered; + unsigned ParallelLoopAccessMDKind; + const DataLayout *TDL; +}; + +char Scalarizer::ID = 0; +} // end anonymous namespace + +// This is disabled by default because having separate loads and stores makes +// it more likely that the -combiner-alias-analysis limits will be reached. 
+static cl::opt<bool> ScalarizeLoadStore
+  ("scalarize-load-store", cl::Hidden, cl::init(false),
+   cl::desc("Allow the scalarizer pass to scalarize loads and store"));
+
+INITIALIZE_PASS(Scalarizer, "scalarizer", "Scalarize vector operations",
+                false, false)
+
+// Prepare to scatter v, which must be a vector or a pointer to a vector.
+// No instructions are created here; components are built on demand by
+// operator[].
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+                     ValueVector *cachePtr)
+  : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+  Type *Ty = V->getType();
+  PtrTy = dyn_cast<PointerType>(Ty);
+  if (PtrTy)
+    Ty = PtrTy->getElementType();
+  Size = Ty->getVectorNumElements();
+  if (!CachePtr)
+    Tmp.resize(Size, 0);
+  else if (CachePtr->empty())
+    CachePtr->resize(Size, 0);
+  else
+    assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+  ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+  // Try to reuse a previous value.
+  if (CV[I])
+    return CV[I];
+  IRBuilder<> Builder(BB, BBI);
+  if (PtrTy) {
+    // Component 0 is the vector pointer reinterpreted as an element pointer.
+    // All other components are addressed relative to it.
+    if (!CV[0]) {
+      Type *Ty =
+        PointerType::get(PtrTy->getElementType()->getVectorElementType(),
+                         PtrTy->getAddressSpace());
+      CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0");
+    }
+    if (I != 0)
+      CV[I] = Builder.CreateConstGEP1_32(CV[0], I,
+                                         V->getName() + ".i" + Twine(I));
+  } else {
+    // Search through a chain of InsertElementInsts looking for element I.
+    // Record other elements in the cache.  The new V is still suitable
+    // for all uncached indices.
+    for (;;) {
+      InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+      if (!Insert)
+        break;
+      ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+      if (!Idx)
+        break;
+      // An out-of-range index cannot be recorded in the cache.  Stop the
+      // search and extract from this insertion instead.
+      uint64_t Index = Idx->getZExtValue();
+      if (Index >= Size)
+        break;
+      unsigned J = Index;
+      V = Insert->getOperand(0);
+      if (I == J) {
+        CV[J] = Insert->getOperand(1);
+        return CV[J];
+      }
+      // Only record the first (outermost) insertion found for each index.
+      // An insertion further down the chain has been overwritten by the
+      // one already cached and must not replace it.
+      if (!CV[J])
+        CV[J] = Insert->getOperand(1);
+    }
+    CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+                                         V->getName() + ".i" + Twine(I));
+  }
+  return CV[I];
+}
+
+// Look up the metadata kind that marks parallel loop accesses.  The module
+// itself is not modified.
+bool Scalarizer::doInitialization(Module &M) {
+  ParallelLoopAccessMDKind =
+    M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+  return false;
+}
+
+// Visit every instruction in F, scalarizing those that can be scalarized.
+// Return true if F was changed.
+bool Scalarizer::runOnFunction(Function &F) {
+  TDL = getAnalysisIfAvailable<DataLayout>();
+  // Track changes here as well: a scalarized instruction with a void result
+  // (a store) is erased immediately and is never recorded for finish().
+  bool Changed = false;
+  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
+    BasicBlock *BB = BBI;
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+      Instruction *I = II;
+      bool Done = visit(I);
+      Changed |= Done;
+      // Advance before erasing so that the iterator stays valid.
+      ++II;
+      if (Done && I->getType()->isVoidTy())
+        I->eraseFromParent();
+    }
+  }
+  bool Finished = finish();
+  return Changed || Finished;
+}
+
+// Return a scattered form of V that can be accessed by Point.  V must be a
+// vector or a pointer to a vector.
+Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+  if (Argument *VArg = dyn_cast<Argument>(V)) {
+    // Put the scattered form of arguments in the entry block,
+    // so that it can be used everywhere.
+    Function *F = VArg->getParent();
+    BasicBlock *BB = &F->getEntryBlock();
+    return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+  }
+  if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+    // Put the scattered form of an instruction directly after the
+    // instruction.
+    // NOTE(review): this looks unsafe if VOp is a terminator such as an
+    // invoke, since there is no position after it in its block -- confirm.
+    BasicBlock *BB = VOp->getParent();
+    return Scatterer(BB, llvm::next(BasicBlock::iterator(VOp)),
+                     V, &Scattered[V]);
+  }
+  // In the fallback case, just put the scattered form before Point and
+  // keep the result local to Point.
+  return Scatterer(Point->getParent(), Point, V);
+}
+
+// Replace Op with the gathered form of the components in CV.
Defer the +// deletion of Op and creation of the gathered form to the end of the pass, +// so that we can avoid creating the gathered form if all uses of Op are +// replaced with uses of CV. +void Scalarizer::gather(Instruction *Op, const ValueVector &CV) { + // Since we're not deleting Op yet, stub out its operands, so that it + // doesn't make anything live unnecessarily. + for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) + Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType())); + + transferMetadata(Op, CV); + + // If we already have a scattered form of Op (created from ExtractElements + // of Op itself), replace them with the new form. + ValueVector &SV = Scattered[Op]; + if (!SV.empty()) { + for (unsigned I = 0, E = SV.size(); I != E; ++I) { + Instruction *Old = cast(SV[I]); + CV[I]->takeName(Old); + Old->replaceAllUsesWith(CV[I]); + Old->eraseFromParent(); + } + } + SV = CV; + Gathered.push_back(GatherList::value_type(Op, &SV)); +} + +// Return true if it is safe to transfer the given metadata tag from +// vector to scalar instructions. +bool Scalarizer::canTransferMetadata(unsigned Tag) { + return (Tag == LLVMContext::MD_tbaa + || Tag == LLVMContext::MD_fpmath + || Tag == LLVMContext::MD_tbaa_struct + || Tag == LLVMContext::MD_invariant_load + || Tag == ParallelLoopAccessMDKind); +} + +// Transfer metadata from Op to the instructions in CV if it is known +// to be safe to do so. +void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { + SmallVector, 4> MDs; + Op->getAllMetadataOtherThanDebugLoc(MDs); + for (unsigned I = 0, E = CV.size(); I != E; ++I) { + if (Instruction *New = dyn_cast(CV[I])) { + for (SmallVectorImpl >::iterator + MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) + if (canTransferMetadata(MI->first)) + New->setMetadata(MI->first, MI->second); + New->setDebugLoc(Op->getDebugLoc()); + } + } +} + +// Try to fill in Layout from Ty, returning true on success. 
Alignment is +// the alignment of the vector, or 0 if the ABI default should be used. +bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, + VectorLayout &Layout) { + if (!TDL) + return false; + + // Make sure we're dealing with a vector. + Layout.VecTy = dyn_cast(Ty); + if (!Layout.VecTy) + return false; + + // Check that we're dealing with full-byte elements. + Layout.ElemTy = Layout.VecTy->getElementType(); + if (TDL->getTypeSizeInBits(Layout.ElemTy) != + TDL->getTypeStoreSizeInBits(Layout.ElemTy)) + return false; + + if (Alignment) + Layout.VecAlign = Alignment; + else + Layout.VecAlign = TDL->getABITypeAlignment(Layout.VecTy); + Layout.ElemSize = TDL->getTypeStoreSize(Layout.ElemTy); + return true; +} + +// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name) +// to create an instruction like I with operands X and Y and name Name. +template +bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { + VectorType *VT = dyn_cast(I.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(I.getParent(), &I); + Scatterer Op0 = scatter(&I, I.getOperand(0)); + Scatterer Op1 = scatter(&I, I.getOperand(1)); + assert(Op0.size() == NumElems && "Mismatched binary operation"); + assert(Op1.size() == NumElems && "Mismatched binary operation"); + ValueVector Res; + Res.resize(NumElems); + for (unsigned Elem = 0; Elem < NumElems; ++Elem) + Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem], + I.getName() + ".i" + Twine(Elem)); + gather(&I, Res); + return true; +} + +bool Scalarizer::visitSelectInst(SelectInst &SI) { + VectorType *VT = dyn_cast(SI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(SI.getParent(), &SI); + Scatterer Op1 = scatter(&SI, SI.getOperand(1)); + Scatterer Op2 = scatter(&SI, SI.getOperand(2)); + assert(Op1.size() == NumElems && "Mismatched select"); + assert(Op2.size() == NumElems && "Mismatched select"); + 
ValueVector Res; + Res.resize(NumElems); + + if (SI.getOperand(0)->getType()->isVectorTy()) { + Scatterer Op0 = scatter(&SI, SI.getOperand(0)); + assert(Op0.size() == NumElems && "Mismatched select"); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I], + SI.getName() + ".i" + Twine(I)); + } else { + Value *Op0 = SI.getOperand(0); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I], + SI.getName() + ".i" + Twine(I)); + } + gather(&SI, Res); + return true; +} + +bool Scalarizer::visitICmpInst(ICmpInst &ICI) { + return splitBinary(ICI, ICmpSplitter(ICI)); +} + +bool Scalarizer::visitFCmpInst(FCmpInst &FCI) { + return splitBinary(FCI, FCmpSplitter(FCI)); +} + +bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) { + return splitBinary(BO, BinarySplitter(BO)); +} + +bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { + return splitBinary(GEPI, GEPSplitter()); +} + +bool Scalarizer::visitCastInst(CastInst &CI) { + VectorType *VT = dyn_cast(CI.getDestTy()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(CI.getParent(), &CI); + Scatterer Op0 = scatter(&CI, CI.getOperand(0)); + assert(Op0.size() == NumElems && "Mismatched cast"); + ValueVector Res; + Res.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(), + CI.getName() + ".i" + Twine(I)); + gather(&CI, Res); + return true; +} + +bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { + VectorType *DstVT = dyn_cast(BCI.getDestTy()); + VectorType *SrcVT = dyn_cast(BCI.getSrcTy()); + if (!DstVT || !SrcVT) + return false; + + unsigned DstNumElems = DstVT->getNumElements(); + unsigned SrcNumElems = SrcVT->getNumElements(); + IRBuilder<> Builder(BCI.getParent(), &BCI); + Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); + ValueVector Res; + Res.resize(DstNumElems); + + if (DstNumElems == 
SrcNumElems) { + for (unsigned I = 0; I < DstNumElems; ++I) + Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(), + BCI.getName() + ".i" + Twine(I)); + } else if (DstNumElems > SrcNumElems) { + // -> . Convert each t1 to and copy the + // individual elements to the destination. + unsigned FanOut = DstNumElems / SrcNumElems; + Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut); + unsigned ResI = 0; + for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) { + Value *V = Op0[Op0I]; + Instruction *VI; + // Look through any existing bitcasts before converting to . + // In the best case, the resulting conversion might be a no-op. + while ((VI = dyn_cast(V)) && + VI->getOpcode() == Instruction::BitCast) + V = VI->getOperand(0); + V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast"); + Scatterer Mid = scatter(&BCI, V); + for (unsigned MidI = 0; MidI < FanOut; ++MidI) + Res[ResI++] = Mid[MidI]; + } + } else { + // -> . Convert each group of into a t2. + unsigned FanIn = SrcNumElems / DstNumElems; + Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn); + unsigned Op0I = 0; + for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) { + Value *V = UndefValue::get(MidTy); + for (unsigned MidI = 0; MidI < FanIn; ++MidI) + V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI), + BCI.getName() + ".i" + Twine(ResI) + + ".upto" + Twine(MidI)); + Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(), + BCI.getName() + ".i" + Twine(ResI)); + } + } + gather(&BCI, Res); + return true; +} + +bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) { + VectorType *VT = dyn_cast(SVI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + Scatterer Op0 = scatter(&SVI, SVI.getOperand(0)); + Scatterer Op1 = scatter(&SVI, SVI.getOperand(1)); + ValueVector Res; + Res.resize(NumElems); + + for (unsigned I = 0; I < NumElems; ++I) { + int Selector = SVI.getMaskValue(I); + if (Selector < 0) + Res[I] = 
UndefValue::get(VT->getElementType()); + else if (unsigned(Selector) < Op0.size()) + Res[I] = Op0[Selector]; + else + Res[I] = Op1[Selector - Op0.size()]; + } + gather(&SVI, Res); + return true; +} + +bool Scalarizer::visitPHINode(PHINode &PHI) { + VectorType *VT = dyn_cast(PHI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(PHI.getParent(), &PHI); + ValueVector Res; + Res.resize(NumElems); + + unsigned NumOps = PHI.getNumOperands(); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps, + PHI.getName() + ".i" + Twine(I)); + + for (unsigned I = 0; I < NumOps; ++I) { + Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I)); + BasicBlock *IncomingBlock = PHI.getIncomingBlock(I); + for (unsigned J = 0; J < NumElems; ++J) + cast(Res[J])->addIncoming(Op[J], IncomingBlock); + } + gather(&PHI, Res); + return true; +} + +bool Scalarizer::visitLoadInst(LoadInst &LI) { + if (!ScalarizeLoadStore) + return false; + if (!LI.isSimple()) + return false; + + VectorLayout Layout; + if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout)) + return false; + + unsigned NumElems = Layout.VecTy->getNumElements(); + IRBuilder<> Builder(LI.getParent(), &LI); + Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); + ValueVector Res; + Res.resize(NumElems); + + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateAlignedLoad(Ptr[I], Layout.getElemAlign(I), + LI.getName() + ".i" + Twine(I)); + gather(&LI, Res); + return true; +} + +bool Scalarizer::visitStoreInst(StoreInst &SI) { + if (!ScalarizeLoadStore) + return false; + if (!SI.isSimple()) + return false; + + VectorLayout Layout; + Value *FullValue = SI.getValueOperand(); + if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout)) + return false; + + unsigned NumElems = Layout.VecTy->getNumElements(); + IRBuilder<> Builder(SI.getParent(), &SI); + Scatterer Ptr = scatter(&SI, SI.getPointerOperand()); + 
Scatterer Val = scatter(&SI, FullValue); + + ValueVector Stores; + Stores.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) { + unsigned Align = Layout.getElemAlign(I); + Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align); + } + transferMetadata(&SI, Stores); + return true; +} + +// Delete the instructions that we scalarized. If a full vector result +// is still needed, recreate it using InsertElements. +bool Scalarizer::finish() { + if (Gathered.empty()) + return false; + for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); + GMI != GME; ++GMI) { + Instruction *Op = GMI->first; + ValueVector &CV = *GMI->second; + if (!Op->use_empty()) { + // The value is still needed, so recreate it using a series of + // InsertElements. + Type *Ty = Op->getType(); + Value *Res = UndefValue::get(Ty); + unsigned Count = Ty->getVectorNumElements(); + IRBuilder<> Builder(Op->getParent(), Op); + for (unsigned I = 0; I < Count; ++I) + Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I), + Op->getName() + ".upto" + Twine(I)); + Res->takeName(Op); + Op->replaceAllUsesWith(Res); + } + Op->eraseFromParent(); + } + Gathered.clear(); + Scattered.clear(); + return true; +} + +FunctionPass *llvm::createScalarizerPass() { + return new Scalarizer(); +} Index: test/Transforms/Scalarizer/basic.ll =================================================================== --- /dev/null +++ test/Transforms/Scalarizer/basic.ll @@ -0,0 +1,390 @@ +; RUN: opt %s -scalarizer -scalarize-load-store -dce -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +declare <4 x float> @ext(<4 x float>) +@g = global <4 x float> zeroinitializer + +define void @f1(<4 x float> %init, <4 x float> *%base, i32 %count) { +; CHECK-LABEL: @f1( +; CHECK: entry: +; CHECK: %init.i0 = extractelement <4 x float> %init, i32 0 +; CHECK: %init.i1 = 
extractelement <4 x float> %init, i32 1 +; CHECK: %init.i2 = extractelement <4 x float> %init, i32 2 +; CHECK: %init.i3 = extractelement <4 x float> %init, i32 3 +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %i = phi i32 [ %count, %entry ], [ %nexti, %loop ] +; CHECK: %acc.i0 = phi float [ %init.i0, %entry ], [ %sel.i0, %loop ] +; CHECK: %acc.i1 = phi float [ %init.i1, %entry ], [ %sel.i1, %loop ] +; CHECK: %acc.i2 = phi float [ %init.i2, %entry ], [ %sel.i2, %loop ] +; CHECK: %acc.i3 = phi float [ %init.i3, %entry ], [ %sel.i3, %loop ] +; CHECK: %nexti = sub i32 %i, 1 +; CHECK: %ptr = getelementptr <4 x float>* %base, i32 %i +; CHECK: %ptr.i0 = bitcast <4 x float>* %ptr to float* +; CHECK: %val.i0 = load float* %ptr.i0, align 16 +; CHECK: %ptr.i1 = getelementptr float* %ptr.i0, i32 1 +; CHECK: %val.i1 = load float* %ptr.i1, align 4 +; CHECK: %ptr.i2 = getelementptr float* %ptr.i0, i32 2 +; CHECK: %val.i2 = load float* %ptr.i2, align 8 +; CHECK: %ptr.i3 = getelementptr float* %ptr.i0, i32 3 +; CHECK: %val.i3 = load float* %ptr.i3, align 4 +; CHECK: %add.i0 = fadd float %val.i0, %val.i2 +; CHECK: %add.i1 = fadd float %val.i1, %val.i3 +; CHECK: %add.i2 = fadd float %acc.i0, %acc.i2 +; CHECK: %add.i3 = fadd float %acc.i1, %acc.i3 +; CHECK: %add.upto0 = insertelement <4 x float> undef, float %add.i0, i32 0 +; CHECK: %add.upto1 = insertelement <4 x float> %add.upto0, float %add.i1, i32 1 +; CHECK: %add.upto2 = insertelement <4 x float> %add.upto1, float %add.i2, i32 2 +; CHECK: %add = insertelement <4 x float> %add.upto2, float %add.i3, i32 3 +; CHECK: %call = call <4 x float> @ext(<4 x float> %add) +; CHECK: %call.i0 = extractelement <4 x float> %call, i32 0 +; CHECK: %cmp.i0 = fcmp ogt float %call.i0, 1.0 +; CHECK: %call.i1 = extractelement <4 x float> %call, i32 1 +; CHECK: %cmp.i1 = fcmp ogt float %call.i1, 2.0 +; CHECK: %call.i2 = extractelement <4 x float> %call, i32 2 +; CHECK: %cmp.i2 = fcmp ogt float %call.i2, 3.0 +; CHECK: %call.i3 = extractelement <4 x 
float> %call, i32 3 +; CHECK: %cmp.i3 = fcmp ogt float %call.i3, 4.0 +; CHECK: %sel.i0 = select i1 %cmp.i0, float %call.i0, float 5.0 +; CHECK: %sel.i1 = select i1 %cmp.i1, float %call.i1, float 6.0 +; CHECK: %sel.i2 = select i1 %cmp.i2, float %call.i2, float 7.0 +; CHECK: %sel.i3 = select i1 %cmp.i3, float %call.i3, float 8.0 +; CHECK: store float %sel.i0, float* %ptr.i0 +; CHECK: store float %sel.i1, float* %ptr.i1 +; CHECK: store float %sel.i2, float* %ptr.i2 +; CHECK: store float %sel.i3, float* %ptr.i3 +; CHECK: %test = icmp eq i32 %nexti, 0 +; CHECK: br i1 %test, label %loop, label %exit +; CHECK: exit: +; CHECK: ret void +entry: + br label %loop + +loop: + %i = phi i32 [ %count, %entry ], [ %nexti, %loop ] + %acc = phi <4 x float> [ %init, %entry ], [ %sel, %loop ] + %nexti = sub i32 %i, 1 + + %ptr = getelementptr <4 x float> *%base, i32 %i + %val = load <4 x float> *%ptr + %dval = bitcast <4 x float> %val to <2 x double> + %dacc = bitcast <4 x float> %acc to <2 x double> + %shuffle1 = shufflevector <2 x double> %dval, <2 x double> %dacc, + <2 x i32> + %shuffle2 = shufflevector <2 x double> %dval, <2 x double> %dacc, + <2 x i32> + %f1 = bitcast <2 x double> %shuffle1 to <4 x float> + %f2 = bitcast <2 x double> %shuffle2 to <4 x float> + %add = fadd <4 x float> %f1, %f2 + %call = call <4 x float> @ext(<4 x float> %add) + %cmp = fcmp ogt <4 x float> %call, + + %sel = select <4 x i1> %cmp, <4 x float> %call, + <4 x float> + store <4 x float> %sel, <4 x float> *%ptr + + %test = icmp eq i32 %nexti, 0 + br i1 %test, label %loop, label %exit + +exit: + ret void +} + +define void @f2(<4 x i32> %init, <4 x i8> *%base, i32 %count) { +; CHECK-LABEL: define void @f2(<4 x i32> %init, <4 x i8>* %base, i32 %count) { +; CHECK: entry: +; CHECK: %init.i0 = extractelement <4 x i32> %init, i32 0 +; CHECK: %init.i1 = extractelement <4 x i32> %init, i32 1 +; CHECK: %init.i2 = extractelement <4 x i32> %init, i32 2 +; CHECK: %init.i3 = extractelement <4 x i32> %init, i32 3 +; 
CHECK: br label %loop +; CHECK: loop: +; CHECK: %i = phi i32 [ %count, %entry ], [ %nexti, %loop ] +; CHECK: %acc.i0 = phi i32 [ %init.i0, %entry ], [ %sel.i0, %loop ] +; CHECK: %acc.i1 = phi i32 [ %init.i1, %entry ], [ %sel.i1, %loop ] +; CHECK: %acc.i2 = phi i32 [ %init.i2, %entry ], [ %sel.i2, %loop ] +; CHECK: %acc.i3 = phi i32 [ %init.i3, %entry ], [ %sel.i3, %loop ] +; CHECK: %nexti = sub i32 %i, 1 +; CHECK: %ptr = getelementptr <4 x i8>* %base, i32 %i +; CHECK: %ptr.i0 = bitcast <4 x i8>* %ptr to i8* +; CHECK: %val.i0 = load i8* %ptr.i0, align 4 +; CHECK: %ptr.i1 = getelementptr i8* %ptr.i0, i32 1 +; CHECK: %val.i1 = load i8* %ptr.i1, align 1 +; CHECK: %ptr.i2 = getelementptr i8* %ptr.i0, i32 2 +; CHECK: %val.i2 = load i8* %ptr.i2, align 2 +; CHECK: %ptr.i3 = getelementptr i8* %ptr.i0, i32 3 +; CHECK: %val.i3 = load i8* %ptr.i3, align 1 +; CHECK: %ext.i0 = sext i8 %val.i0 to i32 +; CHECK: %ext.i1 = sext i8 %val.i1 to i32 +; CHECK: %ext.i2 = sext i8 %val.i2 to i32 +; CHECK: %ext.i3 = sext i8 %val.i3 to i32 +; CHECK: %add.i0 = add i32 %ext.i0, %acc.i0 +; CHECK: %add.i1 = add i32 %ext.i1, %acc.i1 +; CHECK: %add.i2 = add i32 %ext.i2, %acc.i2 +; CHECK: %add.i3 = add i32 %ext.i3, %acc.i3 +; CHECK: %cmp.i0 = icmp slt i32 %add.i0, -10 +; CHECK: %cmp.i1 = icmp slt i32 %add.i1, -11 +; CHECK: %cmp.i2 = icmp slt i32 %add.i2, -12 +; CHECK: %cmp.i3 = icmp slt i32 %add.i3, -13 +; CHECK: %sel.i0 = select i1 %cmp.i0, i32 %add.i0, i32 %i +; CHECK: %sel.i1 = select i1 %cmp.i1, i32 %add.i1, i32 %i +; CHECK: %sel.i2 = select i1 %cmp.i2, i32 %add.i2, i32 %i +; CHECK: %sel.i3 = select i1 %cmp.i3, i32 %add.i3, i32 %i +; CHECK: %trunc.i0 = trunc i32 %sel.i0 to i8 +; CHECK: %trunc.i1 = trunc i32 %sel.i1 to i8 +; CHECK: %trunc.i2 = trunc i32 %sel.i2 to i8 +; CHECK: %trunc.i3 = trunc i32 %sel.i3 to i8 +; CHECK: store i8 %trunc.i0, i8* %ptr.i0, align 4 +; CHECK: store i8 %trunc.i1, i8* %ptr.i1, align 1 +; CHECK: store i8 %trunc.i2, i8* %ptr.i2, align 2 +; CHECK: store i8 %trunc.i3, i8* 
%ptr.i3, align 1 +; CHECK: %test = icmp eq i32 %nexti, 0 +; CHECK: br i1 %test, label %loop, label %exit +; CHECK: exit: +; CHECK: ret void +entry: + br label %loop + +loop: + %i = phi i32 [ %count, %entry ], [ %nexti, %loop ] + %acc = phi <4 x i32> [ %init, %entry ], [ %sel, %loop ] + %nexti = sub i32 %i, 1 + + %ptr = getelementptr <4 x i8> *%base, i32 %i + %val = load <4 x i8> *%ptr + %ext = sext <4 x i8> %val to <4 x i32> + %add = add <4 x i32> %ext, %acc + %cmp = icmp slt <4 x i32> %add, + %single = insertelement <4 x i32> undef, i32 %i, i32 0 + %limit = shufflevector <4 x i32> %single, <4 x i32> undef, + <4 x i32> zeroinitializer + %sel = select <4 x i1> %cmp, <4 x i32> %add, <4 x i32> %limit + %trunc = trunc <4 x i32> %sel to <4 x i8> + store <4 x i8> %trunc, <4 x i8> *%ptr + + %test = icmp eq i32 %nexti, 0 + br i1 %test, label %loop, label %exit + +exit: + ret void +} + +; Check that !tbaa information is preserved. +define void @f3(<4 x i32> *%src, <4 x i32> *%dst) { +; CHECK-LABEL: @f3( +; CHECK: %val.i0 = load i32* %src.i0, align 16, !tbaa ![[TAG:[0-9]*]] +; CHECK: %val.i1 = load i32* %src.i1, align 4, !tbaa ![[TAG]] +; CHECK: %val.i2 = load i32* %src.i2, align 8, !tbaa ![[TAG]] +; CHECK: %val.i3 = load i32* %src.i3, align 4, !tbaa ![[TAG]] +; CHECK: store i32 %add.i0, i32* %dst.i0, align 16, !tbaa ![[TAG:[0-9]*]] +; CHECK: store i32 %add.i1, i32* %dst.i1, align 4, !tbaa ![[TAG]] +; CHECK: store i32 %add.i2, i32* %dst.i2, align 8, !tbaa ![[TAG]] +; CHECK: store i32 %add.i3, i32* %dst.i3, align 4, !tbaa ![[TAG]] +; CHECK: ret void + %val = load <4 x i32> *%src, !tbaa !1 + %add = add <4 x i32> %val, %val + store <4 x i32> %add, <4 x i32> *%dst, !tbaa !2 + ret void +} + +; Check that !tbaa.struct information is preserved. 
+define void @f4(<4 x i32> *%src, <4 x i32> *%dst) { +; CHECK-LABEL: @f4( +; CHECK: %val.i0 = load i32* %src.i0, align 16, !tbaa.struct ![[TAG:[0-9]*]] +; CHECK: %val.i1 = load i32* %src.i1, align 4, !tbaa.struct ![[TAG]] +; CHECK: %val.i2 = load i32* %src.i2, align 8, !tbaa.struct ![[TAG]] +; CHECK: %val.i3 = load i32* %src.i3, align 4, !tbaa.struct ![[TAG]] +; CHECK: store i32 %add.i0, i32* %dst.i0, align 16, !tbaa.struct ![[TAG]] +; CHECK: store i32 %add.i1, i32* %dst.i1, align 4, !tbaa.struct ![[TAG]] +; CHECK: store i32 %add.i2, i32* %dst.i2, align 8, !tbaa.struct ![[TAG]] +; CHECK: store i32 %add.i3, i32* %dst.i3, align 4, !tbaa.struct ![[TAG]] +; CHECK: ret void + %val = load <4 x i32> *%src, !tbaa.struct !5 + %add = add <4 x i32> %val, %val + store <4 x i32> %add, <4 x i32> *%dst, !tbaa.struct !5 + ret void +} + +; Check that llvm.mem.parallel_loop_access information is preserved. +define void @f5(i32 %count, <4 x i32> *%src, <4 x i32> *%dst) { +; CHECK-LABEL: @f5( +; CHECK: %val.i0 = load i32* %this_src.i0, align 16, !llvm.mem.parallel_loop_access ![[TAG:[0-9]*]] +; CHECK: %val.i1 = load i32* %this_src.i1, align 4, !llvm.mem.parallel_loop_access ![[TAG]] +; CHECK: %val.i2 = load i32* %this_src.i2, align 8, !llvm.mem.parallel_loop_access ![[TAG]] +; CHECK: %val.i3 = load i32* %this_src.i3, align 4, !llvm.mem.parallel_loop_access ![[TAG]] +; CHECK: store i32 %add.i0, i32* %this_dst.i0, align 16, !llvm.mem.parallel_loop_access ![[TAG]] +; CHECK: store i32 %add.i1, i32* %this_dst.i1, align 4, !llvm.mem.parallel_loop_access ![[TAG]] +; CHECK: store i32 %add.i2, i32* %this_dst.i2, align 8, !llvm.mem.parallel_loop_access ![[TAG]] +; CHECK: store i32 %add.i3, i32* %this_dst.i3, align 4, !llvm.mem.parallel_loop_access ![[TAG]] +; CHECK: ret void +entry: + br label %loop + +loop: + %index = phi i32 [ 0, %entry ], [ %next_index, %loop ] + %this_src = getelementptr <4 x i32> *%src, i32 %index + %this_dst = getelementptr <4 x i32> *%dst, i32 %index + %val = load <4 x 
i32> *%this_src, !llvm.mem.parallel_loop_access !3 + %add = add <4 x i32> %val, %val + store <4 x i32> %add, <4 x i32> *%this_dst, !llvm.mem.parallel_loop_access !3 + %next_index = add i32 %index, -1 + %continue = icmp ne i32 %next_index, %count + br i1 %continue, label %loop, label %end, !llvm.loop !3 + +end: + ret void +} + +; Check that fpmath information is preserved. +define <4 x float> @f6(<4 x float> %x) { +; CHECK-LABEL: @f6( +; CHECK: %x.i0 = extractelement <4 x float> %x, i32 0 +; CHECK: %res.i0 = fadd float %x.i0, 1.0{{[e+0]*}}, !fpmath ![[TAG:[0-9]*]] +; CHECK: %x.i1 = extractelement <4 x float> %x, i32 1 +; CHECK: %res.i1 = fadd float %x.i1, 2.0{{[e+0]*}}, !fpmath ![[TAG]] +; CHECK: %x.i2 = extractelement <4 x float> %x, i32 2 +; CHECK: %res.i2 = fadd float %x.i2, 3.0{{[e+0]*}}, !fpmath ![[TAG]] +; CHECK: %x.i3 = extractelement <4 x float> %x, i32 3 +; CHECK: %res.i3 = fadd float %x.i3, 4.0{{[e+0]*}}, !fpmath ![[TAG]] +; CHECK: %res.upto0 = insertelement <4 x float> undef, float %res.i0, i32 0 +; CHECK: %res.upto1 = insertelement <4 x float> %res.upto0, float %res.i1, i32 1 +; CHECK: %res.upto2 = insertelement <4 x float> %res.upto1, float %res.i2, i32 2 +; CHECK: %res = insertelement <4 x float> %res.upto2, float %res.i3, i32 3 +; CHECK: ret <4 x float> %res + %res = fadd <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>, + !fpmath !4 + ret <4 x float> %res +} + +; Check that random metadata isn't kept. +define void @f7(<4 x i32> *%src, <4 x i32> *%dst) { +; CHECK-LABEL: @f7( +; CHECK-NOT: !foo +; CHECK: ret void + %val = load <4 x i32> *%src, !foo !5 + %add = add <4 x i32> %val, %val + store <4 x i32> %add, <4 x i32> *%dst, !foo !5 + ret void +} + +; Test GEP with vectors.
+define void @f8(<4 x float *> *%dest, <4 x float *> %ptr0, <4 x i32> %i0, + float *%other) { +; CHECK-LABEL: @f8( +; CHECK: %dest.i0 = bitcast <4 x float*>* %dest to float** +; CHECK: %dest.i1 = getelementptr float** %dest.i0, i32 1 +; CHECK: %dest.i2 = getelementptr float** %dest.i0, i32 2 +; CHECK: %dest.i3 = getelementptr float** %dest.i0, i32 3 +; CHECK: %i0.i1 = extractelement <4 x i32> %i0, i32 1 +; CHECK: %i0.i3 = extractelement <4 x i32> %i0, i32 3 +; CHECK: %ptr0.i0 = extractelement <4 x float*> %ptr0, i32 0 +; CHECK: %val.i0 = getelementptr float* %ptr0.i0, i32 100 +; CHECK: %val.i1 = getelementptr float* %other, i32 %i0.i1 +; CHECK: %ptr0.i2 = extractelement <4 x float*> %ptr0, i32 2 +; CHECK: %val.i2 = getelementptr float* %ptr0.i2, i32 100 +; CHECK: %ptr0.i3 = extractelement <4 x float*> %ptr0, i32 3 +; CHECK: %val.i3 = getelementptr float* %ptr0.i3, i32 %i0.i3 +; CHECK: store float* %val.i0, float** %dest.i0, align 32 +; CHECK: store float* %val.i1, float** %dest.i1, align 8 +; CHECK: store float* %val.i2, float** %dest.i2, align 16 +; CHECK: store float* %val.i3, float** %dest.i3, align 8 +; CHECK: ret void + %i1 = insertelement <4 x i32> %i0, i32 100, i32 0 + %i2 = insertelement <4 x i32> %i1, i32 100, i32 2 + %ptr1 = insertelement <4 x float *> %ptr0, float *%other, i32 1 + %val = getelementptr <4 x float *> %ptr1, <4 x i32> %i2 + store <4 x float *> %val, <4 x float *> *%dest + ret void +} + +; Test the handling of unaligned loads. 
+define void @f9(<4 x float> *%dest, <4 x float> *%src) { +; CHECK: @f9( +; CHECK: %dest.i0 = bitcast <4 x float>* %dest to float* +; CHECK: %dest.i1 = getelementptr float* %dest.i0, i32 1 +; CHECK: %dest.i2 = getelementptr float* %dest.i0, i32 2 +; CHECK: %dest.i3 = getelementptr float* %dest.i0, i32 3 +; CHECK: %src.i0 = bitcast <4 x float>* %src to float* +; CHECK: %val.i0 = load float* %src.i0, align 4 +; CHECK: %src.i1 = getelementptr float* %src.i0, i32 1 +; CHECK: %val.i1 = load float* %src.i1, align 4 +; CHECK: %src.i2 = getelementptr float* %src.i0, i32 2 +; CHECK: %val.i2 = load float* %src.i2, align 4 +; CHECK: %src.i3 = getelementptr float* %src.i0, i32 3 +; CHECK: %val.i3 = load float* %src.i3, align 4 +; CHECK: store float %val.i0, float* %dest.i0, align 8 +; CHECK: store float %val.i1, float* %dest.i1, align 4 +; CHECK: store float %val.i2, float* %dest.i2, align 8 +; CHECK: store float %val.i3, float* %dest.i3, align 4 +; CHECK: ret void + %val = load <4 x float> *%src, align 4 + store <4 x float> %val, <4 x float> *%dest, align 8 + ret void +} + +; ...and again with subelement alignment. 
+define void @f10(<4 x float> *%dest, <4 x float> *%src) { +; CHECK: @f10( +; CHECK: %dest.i0 = bitcast <4 x float>* %dest to float* +; CHECK: %dest.i1 = getelementptr float* %dest.i0, i32 1 +; CHECK: %dest.i2 = getelementptr float* %dest.i0, i32 2 +; CHECK: %dest.i3 = getelementptr float* %dest.i0, i32 3 +; CHECK: %src.i0 = bitcast <4 x float>* %src to float* +; CHECK: %val.i0 = load float* %src.i0, align 1 +; CHECK: %src.i1 = getelementptr float* %src.i0, i32 1 +; CHECK: %val.i1 = load float* %src.i1, align 1 +; CHECK: %src.i2 = getelementptr float* %src.i0, i32 2 +; CHECK: %val.i2 = load float* %src.i2, align 1 +; CHECK: %src.i3 = getelementptr float* %src.i0, i32 3 +; CHECK: %val.i3 = load float* %src.i3, align 1 +; CHECK: store float %val.i0, float* %dest.i0, align 2 +; CHECK: store float %val.i1, float* %dest.i1, align 2 +; CHECK: store float %val.i2, float* %dest.i2, align 2 +; CHECK: store float %val.i3, float* %dest.i3, align 2 +; CHECK: ret void + %val = load <4 x float> *%src, align 1 + store <4 x float> %val, <4 x float> *%dest, align 2 + ret void +} + +; Test that sub-byte loads aren't scalarized. +define void @f11(<32 x i1> *%dest, <32 x i1> *%src0) { +; CHECK: @f11( +; CHECK: %val0 = load <32 x i1>* %src0 +; CHECK: %val1 = load <32 x i1>* %src1 +; CHECK: store <32 x i1> %and, <32 x i1>* %dest +; CHECK: ret void + %src1 = getelementptr <32 x i1> *%src0, i32 1 + %val0 = load <32 x i1> *%src0 + %val1 = load <32 x i1> *%src1 + %and = and <32 x i1> %val0, %val1 + store <32 x i1> %and, <32 x i1> *%dest + ret void +} + +; Test that variable inserts aren't scalarized. 
+define void @f12(<4 x i32> *%dest, <4 x i32> *%src, i32 %index) { +; CHECK: @f12( +; CHECK: %val1 = insertelement <4 x i32> %val0, i32 1, i32 %index +; CHECK-DAG: %val1.i0 = extractelement <4 x i32> %val1, i32 0 +; CHECK-DAG: %val1.i1 = extractelement <4 x i32> %val1, i32 1 +; CHECK-DAG: %val1.i2 = extractelement <4 x i32> %val1, i32 2 +; CHECK-DAG: %val1.i3 = extractelement <4 x i32> %val1, i32 3 +; CHECK-DAG: %val2.i0 = shl i32 1, %val1.i0 +; CHECK-DAG: %val2.i1 = shl i32 2, %val1.i1 +; CHECK-DAG: %val2.i2 = shl i32 3, %val1.i2 +; CHECK-DAG: %val2.i3 = shl i32 4, %val1.i3 +; CHECK: ret void + %val0 = load <4 x i32> *%src + %val1 = insertelement <4 x i32> %val0, i32 1, i32 %index + %val2 = shl <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %val1 + store <4 x i32> %val2, <4 x i32> *%dest + ret void +} + +!0 = metadata !{ metadata !"root" } +!1 = metadata !{ metadata !"set1", metadata !0 } +!2 = metadata !{ metadata !"set2", metadata !0 } +!3 = metadata !{ metadata !3 } +!4 = metadata !{ float 4.0 } +!5 = metadata !{ i64 0, i64 8, null } Index: test/Transforms/Scalarizer/dbginfo.ll =================================================================== --- /dev/null +++ test/Transforms/Scalarizer/dbginfo.ll @@ -0,0 +1,85 @@ +; RUN: opt %s -scalarizer -scalarize-load-store -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Function Attrs: nounwind uwtable +define void @f1(<4 x i32>* nocapture %a, <4 x i32>* nocapture readonly %b, <4 x i32>* nocapture readonly %c) #0 { +; CHECK: @f1( +; CHECK: %a.i0 = bitcast <4 x i32>* %a to i32* +; CHECK: %a.i1 = getelementptr i32* %a.i0, i32 1 +; CHECK: %a.i2 = getelementptr i32* %a.i0, i32 2 +; CHECK: %a.i3 = getelementptr i32* %a.i0, i32 3 +; CHECK: %c.i0 = bitcast <4 x i32>* %c to i32* +; CHECK: %c.i1 = getelementptr i32* %c.i0, i32 1 +; CHECK: %c.i2 = getelementptr i32* %c.i0, i32 2 +; CHECK: %c.i3 = getelementptr i32* %c.i0,
i32 3 +; CHECK: %b.i0 = bitcast <4 x i32>* %b to i32* +; CHECK: %b.i1 = getelementptr i32* %b.i0, i32 1 +; CHECK: %b.i2 = getelementptr i32* %b.i0, i32 2 +; CHECK: %b.i3 = getelementptr i32* %b.i0, i32 3 +; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} +; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} +; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} +; CHECK: %bval.i0 = load i32* %b.i0, align 16, !dbg ![[TAG1:[0-9]+]], !tbaa ![[TAG2:[0-9]+]] +; CHECK: %bval.i1 = load i32* %b.i1, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: %bval.i2 = load i32* %b.i2, align 8, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: %bval.i3 = load i32* %b.i3, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: %cval.i0 = load i32* %c.i0, align 16, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: %cval.i1 = load i32* %c.i1, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: %cval.i2 = load i32* %c.i2, align 8, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: %cval.i3 = load i32* %c.i3, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: %add.i0 = add i32 %bval.i0, %cval.i0, !dbg ![[TAG1]] +; CHECK: %add.i1 = add i32 %bval.i1, %cval.i1, !dbg ![[TAG1]] +; CHECK: %add.i2 = add i32 %bval.i2, %cval.i2, !dbg ![[TAG1]] +; CHECK: %add.i3 = add i32 %bval.i3, %cval.i3, !dbg ![[TAG1]] +; CHECK: store i32 %add.i0, i32* %a.i0, align 16, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: store i32 %add.i1, i32* %a.i1, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: store i32 %add.i2, i32* %a.i2, align 8, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: store i32 %add.i3, i32* %a.i3, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]] +; CHECK: ret void +entry: + tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !15), !dbg !20 + tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !16), !dbg !20 + tail call 
void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !17), !dbg !20 + %bval = load <4 x i32>* %b, align 16, !dbg !21, !tbaa !22 + %cval = load <4 x i32>* %c, align 16, !dbg !21, !tbaa !22 + %add = add <4 x i32> %bval, %cval, !dbg !21 + store <4 x i32> %add, <4 x i32>* %a, align 16, !dbg !21, !tbaa !22 + ret void, !dbg !25 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata) #1 + +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!18} +!llvm.ident = !{!19} + +!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 194134) (llvm/trunk 194126)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/richards/llvm/build//tmp/add.c] [DW_LANG_C99] +!1 = metadata !{metadata !"/tmp/add.c", metadata !"/home/richards/llvm/build"} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f1", metadata !"f1", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @f1, null, null, metadata !14, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [f] +!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/home/richards/llvm/build//tmp/add.c] +!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!7 = metadata !{null, metadata !8, metadata !8, metadata !8} +!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, 
i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from V4SI] +!9 = metadata !{i32 786454, metadata !1, null, metadata !"V4SI", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ] [V4SI] [line 1, size 0, align 0, offset 0] [from ] +!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 2048, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int] +!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!12 = metadata !{metadata !13} +!13 = metadata !{i32 786465, i64 0, i64 4} ; [ DW_TAG_subrange_type ] [0, 3] +!14 = metadata !{metadata !15, metadata !16, metadata !17} +!15 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 3] +!16 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 33554435, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 3] +!17 = metadata !{i32 786689, metadata !4, metadata !"c", metadata !5, i32 50331651, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 3] +!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4} +!19 = metadata !{metadata !"clang version 3.4 (trunk 194134) (llvm/trunk 194126)"} +!20 = metadata !{i32 3, i32 0, metadata !4, null} +!21 = metadata !{i32 5, i32 0, metadata !4, null} +!22 = metadata !{metadata !23, metadata !23, i64 0} +!23 = metadata !{metadata !"omnipotent char", metadata !24, i64 0} +!24 = metadata !{metadata !"Simple C/C++ TBAA"} +!25 = metadata !{i32 6, i32 0, metadata !4, null} Index: test/Transforms/Scalarizer/no-data-layout.ll =================================================================== --- /dev/null +++ test/Transforms/Scalarizer/no-data-layout.ll @@ -0,0 +1,25 @@ +; 
RUN: opt %s -scalarizer -scalarize-load-store -S | FileCheck %s + +; Test the handling of loads and stores when no data layout is available. +define void @f1(<4 x float> *%dest, <4 x float> *%src) { +; CHECK: @f1( +; CHECK: %val = load <4 x float>* %src, align 4 +; CHECK: %val.i0 = extractelement <4 x float> %val, i32 0 +; CHECK: %add.i0 = fadd float %val.i0, %val.i0 +; CHECK: %val.i1 = extractelement <4 x float> %val, i32 1 +; CHECK: %add.i1 = fadd float %val.i1, %val.i1 +; CHECK: %val.i2 = extractelement <4 x float> %val, i32 2 +; CHECK: %add.i2 = fadd float %val.i2, %val.i2 +; CHECK: %val.i3 = extractelement <4 x float> %val, i32 3 +; CHECK: %add.i3 = fadd float %val.i3, %val.i3 +; CHECK: %add.upto0 = insertelement <4 x float> undef, float %add.i0, i32 0 +; CHECK: %add.upto1 = insertelement <4 x float> %add.upto0, float %add.i1, i32 1 +; CHECK: %add.upto2 = insertelement <4 x float> %add.upto1, float %add.i2, i32 2 +; CHECK: %add = insertelement <4 x float> %add.upto2, float %add.i3, i32 3 +; CHECK: store <4 x float> %add, <4 x float>* %dest, align 8 +; CHECK: ret void + %val = load <4 x float> *%src, align 4 + %add = fadd <4 x float> %val, %val + store <4 x float> %add, <4 x float> *%dest, align 8 + ret void +}