Index: llvm/trunk/lib/Analysis/VectorUtils.cpp =================================================================== --- llvm/trunk/lib/Analysis/VectorUtils.cpp +++ llvm/trunk/lib/Analysis/VectorUtils.cpp @@ -712,7 +712,9 @@ // create a group for B, we continue with the bottom-up algorithm to ensure // we don't break any of B's dependences. InterleaveGroup *Group = nullptr; - if (isStrided(DesB.Stride)) { + // TODO: Ignore B if it is in a predicated block. This restriction can be + // relaxed in the future once we handle masked interleaved groups. + if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) { Group = getInterleaveGroup(B); if (!Group) { LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B Index: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-pr39099.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-pr39099.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-pr39099.ll @@ -0,0 +1,60 @@ +; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + +; This test checks the fix for PR39099. +; +; Check that the predicated load is not vectorized as an +; interleaved-group (which requires proper masking, currently unsupported) +; but rather as scalarized accesses. +; (For SKX, Gather is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). 
+; +; void masked_strided(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char t = p[2*ix]; +; q[ix] = t; +; } +; } +; } + +;CHECK-LABEL: @masked_strided( +;CHECK: vector.body: +;CHECK-NEXT: %index = phi i32 +;CHECK-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;CHECK-NEXT: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;CHECK-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], +;CHECK-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;CHECK-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;CHECK-NOT: %[[WIDEVEC:.+]] = load <16 x i8>, <16 x i8>* %{{.*}}, align 1 +;CHECK-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> + +define dso_local void @masked_strided(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.09, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.09, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.09, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} Index: llvm/trunk/test/Transforms/LoopVectorize/pr39099.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/pr39099.ll +++ llvm/trunk/test/Transforms/LoopVectorize/pr39099.ll @@ -0,0 +1,42 @@ +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses 
-debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + +; Ensure that we don't create interleave groups for predicated +; strided accesses. + +; CHECK: LV: Checking a loop in "masked_strided" +; CHECK: LV: Analyzing interleaved accesses... +; CHECK-NOT: LV: Creating an interleave group + +define dso_local void @masked_strided(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.017 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.017, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.017, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %0, i8* %arrayidx4, align 1 + %sub = sub i8 0, %0 + %add = or i32 %mul, 1 + %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx8, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.017, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +}