Index: llvm/trunk/include/llvm/Analysis/VectorUtils.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/VectorUtils.h
+++ llvm/trunk/include/llvm/Analysis/VectorUtils.h
@@ -308,6 +308,23 @@
     propagateMetadata(NewInst, VL);
   }
 
+  /// Returns true if this Group requires a scalar iteration to handle gaps.
+  bool requiresScalarEpilogue() const {
+    // If the Group has no gaps, or has gaps but its last member exists, then
+    // a scalar epilogue is not needed for this group.
+    if (getNumMembers() == getFactor() || getMember(getFactor() - 1))
+      return false;
+
+    // We have a group with gaps. It therefore cannot be a group of stores,
+    // and it can't be a reversed access, because such groups get invalidated.
+    assert(!getMember(0)->mayWriteToMemory() &&
+           "Group should have been invalidated");
+    assert(!isReverse() && "Group should have been invalidated");
+
+    // This is a group of loads, with gaps, and without a last member.
+    return true;
+  }
+
 private:
   unsigned Factor; // Interleave Factor.
   bool Reverse;
@@ -388,6 +405,11 @@
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// Invalidate groups that require a scalar epilogue (due to gaps). This can
+  /// happen when we optimize for size and don't allow creating a scalar
+  /// epilogue.
+  void invalidateGroupsRequiringScalarEpilogue();
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
Index: llvm/trunk/lib/Analysis/VectorUtils.cpp
===================================================================
--- llvm/trunk/lib/Analysis/VectorUtils.cpp
+++ llvm/trunk/lib/Analysis/VectorUtils.cpp
@@ -919,3 +919,27 @@
     }
   }
 }
+
+void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
+  // If no group had triggered the requirement to create an epilogue loop,
+  // there is nothing to do.
+  if (!requiresScalarEpilogue())
+    return;
+
+  // Avoid releasing a Group twice.
+  SmallPtrSet<InterleaveGroup *, 4> DelSet;
+  for (auto &I : InterleaveGroupMap) {
+    InterleaveGroup *Group = I.second;
+    if (Group->requiresScalarEpilogue())
+      DelSet.insert(Group);
+  }
+  for (auto *Ptr : DelSet) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Invalidate candidate interleaved group due to gaps that "
+           "require a scalar epilogue.\n");
+    releaseGroup(Ptr);
+  }
+
+  RequiresScalarEpilogue = false;
+}
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4599,6 +4599,14 @@
     return None;
   }
 
+  // Record that a scalar epilogue is not allowed.
+  LLVM_DEBUG(dbgs() << "LV: Not inserting scalar epilogue for access with gaps "
+                       "due to -Os/-Oz.\n");
+
+  // We don't create an epilogue when optimizing for size.
+  // Invalidate interleave groups that require an epilogue.
+  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
 
   if (TC > 0 && TC % MaxVF == 0) {
@@ -4610,8 +4618,6 @@
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  // FIXME: return None if loop requiresScalarEpilog(), or look for a
-  // smaller MaxVF that does not require a scalar epilog.
   if (Legal->canFoldTailByMasking()) {
     FoldTailByMasking = true;
     return MaxVF;
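
As background for the test changes below, here is a minimal stand-alone C++ sketch of the kind of loop this patch is about. It is illustrative only: the function name is made up, and the same pattern appears as the unconditional_strided1_optsize test added further down. Only the even elements of p are read, so the access forms a factor-2 interleave group whose last (second) member is a gap; a wide load covering both lanes may read one element past the last one the scalar loop would touch, which is why such a group normally requires a scalar epilogue iteration, and why it now gets invalidated when optimizing for size, where no epilogue may be created.

// Illustrative sketch only, not part of the patch: a strided load whose
// interleave group has a gap as its last member. A wide load covering both
// p[2*i] and p[2*i+1] could read past the end of 'p' on the final iteration,
// so the vectorizer either peels a scalar epilogue iteration or, under
// -Os/-Oz with this change, invalidates the group and scalarizes the access.
void strided_load_with_gap(const unsigned char *p, unsigned char *q) {
  for (int i = 0; i < 1024; ++i)
    q[i] = p[2 * i]; // p[2*i+1] is never read: the last group member is a gap
}

Peeling keeps the final iteration scalar so the potentially out-of-bounds lane is never loaded; with optsize that option is unavailable, hence the new invalidateGroupsRequiringScalarEpilogue() hook above.
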
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
 
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
@@ -9,9 +9,13 @@
 ; interleaved-group but rather as scalarized accesses.
 ; (For SKX, Gather is not supported by the compiler for chars, therefore
 ; the only remaining alternative is to scalarize).
+; In this case a scalar epilogue is not needed.
+;
 ; When masked-interleave-group is enabled we expect to find the proper mask
 ; shuffling code, feeding the wide masked load for an interleave-group (with
 ; a single member).
+; Since the last (second) member of the load-group is a gap, peeling is used,
+; so we also expect to find a scalar epilogue loop.
 ;
 ; void masked_strided1(const unsigned char* restrict p,
 ;                      unsigned char* restrict q,
 ;                      unsigned char guard) {
@@ -38,6 +42,8 @@
 ;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
 ;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
 ;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32>
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED: for.end:
 
 ;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
 ;ENABLED_MASKED_STRIDED: vector.body:
 ;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
 ;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [
 ;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
 ;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32>
 ;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
 ;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32>
+;ENABLED_MASKED_STRIDED: for.body:
 
 define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
 entry:
   %conv = zext i8 %guard to i32
   br label %for.body
@@ -75,6 +82,109 @@
   ret void
 }
 
+; Exactly the same scenario, except we are now optimizing for size, so
+; we check that no scalar epilogue is created. Since we can't create an
+; epilogue, the interleave-group is invalidated because it has gaps, and we
+; end up scalarizing.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and to vectorize the access as an interleaved-group. This is now
+; fixed, and we make sure that a scalar epilogue does not exist).
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [
+;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]],
+;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED: for.end:
+
+define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Same, but the load/store are not predicated. The interleave-group is
+; invalidated here as well because we have gaps and we can't create an
+; epilogue. The access is thus scalarized.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and to vectorize the access as an interleaved-group. This is now
+; fixed, and we make sure that a scalar epilogue does not exist).
+; Since enable-masked-interleaved-accesses currently only affects predicated
+; accesses, the behavior is the same whether this switch is set or unset.
+
+
+; void unconditional_strided1_optsize(const unsigned char* restrict p,
+;                                     unsigned char* restrict q,
+;                                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     char t = p[2*ix];
+;     q[ix] = t;
+; }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED: for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED: for.end:
+
+define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = shl nuw nsw i32 %ix.06, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.06, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
 ; Check also a scenario with full interleave-groups (no gaps) as well as both
 ; load and store groups. We check that when masked-interleave-group is disabled
 ; the predicated loads (and stores) are not vectorized as an