Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -454,6 +454,9 @@
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
+  /// \brief Split struct fields passed via GPRs in SROA.
+  bool enableSplitStructArgs() const;
+
   /// \brief Indicate that it is potentially unsafe to automatically vectorize
   /// floating-point operations because the semantics of vector and scalar
   /// floating-point operations may differ. For example, ARM NEON v7 SIMD math
@@ -806,6 +809,7 @@
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
+  virtual bool enableSplitStructArgs() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
@@ -1020,6 +1024,9 @@
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool enableSplitStructArgs() override {
+    return Impl.enableSplitStructArgs();
+  }
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -274,6 +274,8 @@
 
   bool enableInterleavedAccessVectorization() { return false; }
 
+  bool enableSplitStructArgs() { return false; }
+
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
Index: include/llvm/Transforms/Scalar/SROA.h
===================================================================
--- include/llvm/Transforms/Scalar/SROA.h
+++ include/llvm/Transforms/Scalar/SROA.h
@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
@@ -59,6 +60,7 @@
   LLVMContext *C = nullptr;
   DominatorTree *DT = nullptr;
   AssumptionCache *AC = nullptr;
+  TargetTransformInfo *TTI = nullptr;
 
   /// \brief Worklist of alloca instructions to simplify.
   ///
@@ -114,7 +116,7 @@
 
   /// Helper used by both the public run method and by the legacy pass.
   PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT,
-                            AssumptionCache &RunAC);
+                            AssumptionCache &RunAC, TargetTransformInfo &RunTTI);
 
   bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS);
   AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -215,6 +215,10 @@
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::enableSplitStructArgs() const {
+  return TTIImpl->enableSplitStructArgs();
+}
+
 bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -61,6 +61,8 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions);
   bool enableInterleavedAccessVectorization();
 
+  bool enableSplitStructArgs();
+
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getCacheLineSize();
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -447,3 +447,8 @@
   return Cost;
 }
 
+bool PPCTTIImpl::enableSplitStructArgs() {
+  // Struct arguments are coerced to i64 arrays for the 64-bit ELFv2 ABI.
+  return ST->isPPC64() && ST->isELFv2ABI();
+}
+
Index: lib/Transforms/Scalar/SROA.cpp
===================================================================
--- lib/Transforms/Scalar/SROA.cpp
+++ lib/Transforms/Scalar/SROA.cpp
@@ -2988,11 +2988,14 @@
   /// value (as opposed to the user).
   Use *U;
 
+  bool SplitStructArgs = false;
+
 public:
   /// Rewrite loads and stores through a pointer and all pointers derived from
   /// it.
-  bool rewrite(Instruction &I) {
+  bool rewrite(Instruction &I, bool SplitArgs) {
     DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
+    SplitStructArgs = SplitArgs;
     enqueueUsers(I);
     bool Changed = false;
     while (!Queue.empty()) {
@@ -3140,6 +3143,90 @@
     }
   };
 
+  // Split a store of a struct argument that was coerced to an i64 array
+  // directly into per-field stores. Clang emits, e.g.:
+  /*
+      %struct.record = type { i64, i32, i32 }
+
+      define signext i32 @ppc64le_func([2 x i64] %r.coerce) #0 {
+      entry:
+        %r = alloca %struct.record, align 8
+        %0 = bitcast %struct.record* %r to [2 x i64]*
+        store [2 x i64] %r.coerce, [2 x i64]* %0, align 8
+
+      define i32 @x86_64_func(i64 %r.coerce0, i64 %r.coerce1) #0 {
+      entry:
+        %r = alloca %struct.record, align 8
+        %0 = bitcast %struct.record* %r to { i64, i64 }*
+        %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 0
+        store i64 %r.coerce0, i64* %1, align 8
+        %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 1
+        store i64 %r.coerce1, i64* %2, align 8
+  */
+  bool emitSplitOpsForArgs(StoreInst &SI) {
+    Value *V = SI.getValueOperand();
+    assert(isa<Argument>(V));
+    ArrayType *ATy = dyn_cast<ArrayType>(V->getType());
+
+    // We optimize only when a struct parameter is passed as an array of
+    // 64-bit integers in GPRs (e.g. ppc64).
+    if (!ATy || !ATy->getElementType()->isIntegerTy(64))
+      return false;
+
+    BitCastInst *BCI = dyn_cast<BitCastInst>(SI.getPointerOperand());
+    if (!BCI)
+      return false;
+
+    AllocaInst *AI = dyn_cast<AllocaInst>(BCI->getOperand(0));
+    if (!AI)
+      return false;
+
+    StructType *STy = dyn_cast<StructType>(AI->getAllocatedType());
+    if (!STy)
+      return false;
+
+    // Nested structs are not supported yet.
+    for (Type *Ty : STy->elements())
+      if (!Ty->isSingleValueType())
+        return false;
+
+    const DataLayout &DL = AI->getModule()->getDataLayout();
+    SmallVector<unsigned, 1> Indices;
+    SmallVector<Value *, 2> GEPIndices;
+    Type *Int32Ty = Type::getInt32Ty(STy->getContext());
+    GEPIndices.push_back(ConstantInt::get(Int32Ty, 0));
+
+    IRBuilderTy IRB(&SI);
+    for (unsigned Idx = 0, NumElts = STy->getNumElements(); Idx != NumElts;
+         ++Idx) {
+      Type *Ty = STy->getElementType(Idx);
+      GEPIndices.push_back(ConstantInt::get(Int32Ty, Idx));
+      uint64_t Offset = DL.getIndexedOffsetInType(STy, GEPIndices);
+
+      // Pull the i64 array element containing this field out of the argument.
+      Indices.push_back(unsigned(Offset / 8));
+      Value *ExtractedI64 =
+          IRB.CreateExtractValue(V, Indices, V->getName() + "Extract");
+      Indices.pop_back();
+
+      Value *InBoundsGEP = IRB.CreateInBoundsGEP(AI, GEPIndices);
+      GEPIndices.pop_back();
+
+      IntegerType *FieldTy =
+          Type::getIntNTy(SI.getContext(), DL.getTypeStoreSize(Ty) * 8);
+      Value *Ptr = IRB.CreateBitCast(
+          InBoundsGEP, FieldTy->getPointerTo(SI.getPointerAddressSpace()));
+      Value *ToStore = ExtractedI64;
+      if (DL.getTypeStoreSize(Ty) < DL.getTypeStoreSize(ExtractedI64->getType()))
+        // The field is narrower than i64; shift and truncate the relevant
+        // bits out of the containing register value.
+        ToStore = extractInteger(DL, IRB, ExtractedI64, FieldTy, Offset % 8,
+                                 "extract");
+      Value *Store = IRB.CreateStore(ToStore, Ptr);
+      (void)Store;
+      DEBUG(dbgs() << "          to: " << *Store << "\n");
+    }
+    return true;
+  }
+
   bool visitStoreInst(StoreInst &SI) {
     if (!SI.isSimple() || SI.getPointerOperand() != *U)
       return false;
@@ -3149,6 +3236,10 @@
 
     // We have an aggregate being stored, split it apart.
     DEBUG(dbgs() << "    original: " << SI << "\n");
+    if (SplitStructArgs && isa<Argument>(V) && emitSplitOpsForArgs(SI)) {
+      SI.eraseFromParent();
+      return true;
+    }
     StoreOpSplitter Splitter(&SI, *U);
     Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
     SI.eraseFromParent();
@@ -4100,7 +4191,7 @@
 
   // First, split any FCA loads and stores touching this alloca to promote
   // better splitting and promotion opportunities.
   AggLoadStoreRewriter AggRewriter;
-  Changed |= AggRewriter.rewrite(AI);
+  Changed |= AggRewriter.rewrite(AI, TTI->enableSplitStructArgs());
 
   // Build the slices using a recursive instruction-visiting builder.
   AllocaSlices AS(DL, AI);
@@ -4197,11 +4288,12 @@
 }
 
 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
-                                AssumptionCache &RunAC) {
+                                AssumptionCache &RunAC, TargetTransformInfo &RunTTI) {
   DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   C = &F.getContext();
   DT = &RunDT;
   AC = &RunAC;
+  TTI = &RunTTI;
 
   BasicBlock &EntryBB = F.getEntryBlock();
   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
@@ -4249,7 +4341,8 @@
 
 PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
   return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
-                 AM.getResult<AssumptionAnalysis>(F));
+                 AM.getResult<AssumptionAnalysis>(F),
+                 AM.getResult<TargetIRAnalysis>(F));
 }
 
 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
@@ -4270,12 +4363,14 @@
 
     auto PA = Impl.runImpl(
         F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
     return !PA.areAllPreserved();
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
     AU.setPreservesCFG();
   }
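
For reference (not part of the patch): a sketch of roughly what the new path should produce for the ppc64le example in the emitSplitOpsForArgs comment, after the FCA store is split but before the rest of SROA promotes %r away. Value names are invented for illustration, and the layout assumes the little-endian ELFv2 coercion of %struct.record into [2 x i64] (fields at byte offsets 0, 8, and 12):

```llvm
%struct.record = type { i64, i32, i32 }

define signext i32 @ppc64le_func([2 x i64] %r.coerce) {
entry:
  %r = alloca %struct.record, align 8
  ; field 0: i64 at byte offset 0, exactly array element 0
  %e0 = extractvalue [2 x i64] %r.coerce, 0
  %f0 = getelementptr inbounds %struct.record, %struct.record* %r, i32 0, i32 0
  store i64 %e0, i64* %f0, align 8
  ; field 1: i32 at byte offset 8, the low half of array element 1
  %e1 = extractvalue [2 x i64] %r.coerce, 1
  %lo = trunc i64 %e1 to i32
  %f1 = getelementptr inbounds %struct.record, %struct.record* %r, i32 0, i32 1
  store i32 %lo, i32* %f1, align 8
  ; field 2: i32 at byte offset 12, the high half of array element 1
  %shift = lshr i64 %e1, 32
  %hi = trunc i64 %shift to i32
  %f2 = getelementptr inbounds %struct.record, %struct.record* %r, i32 0, i32 2
  store i32 %hi, i32* %f2, align 8
  %b = load i32, i32* %f1, align 8
  ret i32 %b
}
```

With the aggregate store rewritten into scalar stores, the usual slice-based rewriting can then promote each field to an SSA value instead of keeping the whole alloca alive; the end-to-end effect should be observable with `opt -sroa -S` on a powerpc64le triple.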