Index: ../../ver4/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- ../../ver4/lib/Transforms/Vectorize/LoopVectorize.cpp +++ ../../ver4/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1030,6 +1030,18 @@ return InterleaveGroupMap.count(Instr); } + /// \brief Check if \p Instr belongs to any interleave group but Gather + /// variant should be used for vectorization with certain \p VF. + bool isInterleavedAndProfitable(Instruction *Instr, unsigned VF) const { + return InterleaveGroupMap.count(Instr) && + !PreferGatherSet.count(std::make_pair(Instr, VF)); + } + + /// \brief Prefer Gather vs Interleave for certain \p VF. + void preferGather(Instruction *Instr, unsigned VF) { + PreferGatherSet.insert(std::make_pair(Instr, VF)); + } + /// \brief Return the maximum interleave factor of all interleaved groups. unsigned getMaxInterleaveFactor() const { unsigned MaxFactor = 1; @@ -1073,6 +1085,10 @@ /// Holds the relationships between the members and the interleave group. DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; + /// Holds list of instructions that should be vectorized to "gather" for + /// a certain VF. + SmallSet<std::pair<Instruction *, unsigned>, 4> PreferGatherSet; + /// Holds dependences among the memory accesses in the loop. It maps a source /// access to a set of dependent sink accesses. DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences; @@ -1618,6 +1634,17 @@ return InterleaveInfo.isInterleaved(Instr); } + /// \brief Check if \p Instr belongs to any interleaved access group + /// and interleaving is profitable under certain \p VF. + bool isAccessInterleavedAndProfitable(Instruction *Instr, unsigned VF) { + return InterleaveInfo.isInterleavedAndProfitable(Instr, VF); + } + + /// \brief Prefer Gather vs Interleave for certain \p VF. + void preferGather(Instruction *Instr, unsigned VF) { + return InterleaveInfo.preferGather(Instr, VF); + } + /// \brief Return the maximum interleave factor of all interleaved groups. unsigned getMaxInterleaveFactor() const { return InterleaveInfo.getMaxInterleaveFactor(); @@ -2811,8 +2838,9 @@ assert((LI || SI) && "Invalid Load/Store instruction"); - // Try to vectorize the interleave group if this access is interleaved. - if (Legal->isAccessInterleaved(Instr)) + // Try to vectorize the interleave group if this access is interleaved + // and this kind of vectorization is profitable. + if (Legal->isAccessInterleavedAndProfitable(Instr, VF)) return vectorizeInterleaveGroup(Instr); Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); @@ -7012,12 +7040,26 @@ if (Legal->isAccessInterleaved(I)) { auto Group = Legal->getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); + unsigned InterleaveFactor = Group->getFactor(); + // Instructions may be combined in "interleaved access" groups, but the + // "gather" operation for each of them may be cheaper. + // I do not compare "gather" cost vs "interleave pattern", I just assume + // that each target provides reasonable MaxInterleaveFactor that + // makes the "interleave pattern" profitable. When InterleaveFactor + // exceeds the maximum provided by TTI, the Gather, if applicable, + // becomes better. + if (InterleaveFactor > TTI.getMaxInterleaveFactor(VF) && + Legal->isLegalGatherOrScatter(I)) { + Legal->preferGather(I, VF); + return TTI.getAddressComputationCost(VectorTy) + + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment); + } // Only calculate the cost once at the insert position. if (Group->getInsertPos() != I) return 0; - unsigned InterleaveFactor = Group->getFactor(); Type *WideVecTy = VectorType::get(VectorTy->getVectorElementType(), VectorTy->getVectorNumElements() * InterleaveFactor); Index: ../../ver4/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll =================================================================== --- ../../ver4/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll +++ ../../ver4/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll @@ -0,0 +1,41 @@ +; RUN: opt < %s -O2 -S + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Source code: +; void foo() { +; for (int i=0; i<N; i++) +; B[i] = A[i*5]; +; } +; Interleave pattern is not profitable for stride 5. +; In this case we should use "gather" + +;CHECK-LABEL: @foo +;CHECK: llvm.masked.gather.v16i32 +;CHECK: ret void + +define void @foo() local_unnamed_addr #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nuw nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skx" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pcommit,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } +