Index: ../../ver4/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- ../../ver4/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ ../../ver4/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1030,6 +1030,19 @@
     return InterleaveGroupMap.count(Instr);
   }
 
+  /// \brief Check if \p Instr belongs to an interleave group and no
+  /// gather/scatter has been preferred for it at vectorization factor \p VF.
+  bool isInterleavedAndProfitable(Instruction *Instr, unsigned VF) const {
+    return InterleaveGroupMap.count(Instr) &&
+           !PreferGatherSet.count(std::make_pair(Instr, VF));
+  }
+
+  /// \brief Record that a gather/scatter is preferred over the interleave
+  /// group for \p Instr at vectorization factor \p VF.
+  void preferGather(Instruction *Instr, unsigned VF) {
+    PreferGatherSet.insert(std::make_pair(Instr, VF));
+  }
+
   /// \brief Return the maximum interleave factor of all interleaved groups.
   unsigned getMaxInterleaveFactor() const {
     unsigned MaxFactor = 1;
@@ -1073,6 +1086,10 @@
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
 
+  /// Holds the (instruction, VF) pairs for which a gather/scatter is preferred
+  /// over the interleaved access.
+  SmallSet<std::pair<Instruction *, unsigned>, 4> PreferGatherSet;
+
   /// Holds dependences among the memory accesses in the loop. It maps a source
   /// access to a set of dependent sink accesses.
   DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
@@ -1618,6 +1635,18 @@
     return InterleaveInfo.isInterleaved(Instr);
   }
 
+  /// \brief Check if \p Instr belongs to an interleave group that should be
+  /// vectorized as such (no gather/scatter is preferred for it at \p VF).
+  bool isAccessInterleavedAndProfitable(Instruction *Instr, unsigned VF) {
+    return InterleaveInfo.isInterleavedAndProfitable(Instr, VF);
+  }
+
+  /// \brief Record that a gather/scatter is preferred over the interleave
+  /// group for \p Instr at vectorization factor \p VF.
+  void preferGather(Instruction *Instr, unsigned VF) {
+    InterleaveInfo.preferGather(Instr, VF);
+  }
+
   /// \brief Return the maximum interleave factor of all interleaved groups.
   unsigned getMaxInterleaveFactor() const {
     return InterleaveInfo.getMaxInterleaveFactor();
@@ -2812,7 +2841,7 @@
   assert((LI || SI) && "Invalid Load/Store instruction");
 
   // Try to vectorize the interleave group if this access is interleaved.
-  if (Legal->isAccessInterleaved(Instr))
+  if (Legal->isAccessInterleavedAndProfitable(Instr, VF))
     return vectorizeInterleaveGroup(Instr);
 
   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
@@ -7012,12 +7041,26 @@
     if (Legal->isAccessInterleaved(I)) {
       auto Group = Legal->getInterleavedAccessGroup(I);
       assert(Group && "Fail to get an interleaved access group.");
+      unsigned InterleaveFactor = Group->getFactor();
+      // Instructions may be combined in "interleaved access" groups, but the
+      // "gather" operation for each of them may be cheaper.
+      // The "gather" cost is not compared against the "interleave pattern"
+      // cost here; each target is assumed to provide a reasonable
+      // MaxInterleaveFactor that makes the "interleave pattern" profitable.
+      // When InterleaveFactor exceeds the maximum provided by TTI, the
+      // gather, if applicable, is assumed to be better.
+      if (InterleaveFactor > TTI.getMaxInterleaveFactor(VF) &&
+          Legal->isLegalGatherOrScatter(I)) {
+        Legal->preferGather(I, VF);
+        return TTI.getAddressComputationCost(VectorTy) +
+               TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+                                          Legal->isMaskRequired(I), Alignment);
+      }
 
       // Only calculate the cost once at the insert position.
       if (Group->getInsertPos() != I)
         return 0;
 
-      unsigned InterleaveFactor = Group->getFactor();
       Type *WideVecTy =
           VectorType::get(VectorTy->getVectorElementType(),
                           VectorTy->getVectorNumElements() * InterleaveFactor);
Index: ../../ver4/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
===================================================================
--- ../../ver4/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
+++ ../../ver4/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -O2 -S
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i32] zeroinitializer, align 16
+@B = global [10240 x i32] zeroinitializer, align 16
+
+; Source code:
+; void foo() {
+; for (int i=0; i