Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6017,8 +6017,14 @@ return NoVectorization; // Invalidate interleave groups if all blocks of loop will be predicated. - if (CM.blockNeedsPredication(OrigLoop->getHeader())) + if (CM.blockNeedsPredication(OrigLoop->getHeader()) && + !useMaskedInterleavedAccesses(*TTI)) { + LLVM_DEBUG( + dbgs() + << "LV: Invalidate all interleaved groups due to fold-tail by masking " + "which requires masked-interleaved support.\n"); CM.InterleaveInfo.reset(); + } if (UserVF) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Index: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -132,6 +132,106 @@ ret void } + +; Accesses with gaps under Optsize scenario again, with unknown trip-count +; this time, in order to check the behavior of folding-the-tail (folding the +; remainder loop into the main loop using masking) together with interleaved- +; groups. +; When masked-interleave-group is disabled the interleave-groups will be +; invalidated during Legality checks; +; When masked-interleave-group is enabled the interleave-groups will be +; invalidated during cost-model checks, because we don't have a way to support +; interleave-groups with gaps that require an epilogue using masking. +; So in both cases we check for no epilogue and scalarized conditional accesses. 
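+;
+; (Illustrative sketch, not taken from the test output; names are hypothetical.
+; With fold-tail by masking, the vector body executes under a lane mask that
+; compares the vector induction against the backedge-taken count, roughly:
+;   %tail.mask = icmp ule <8 x i32> %vec.ind, %btc.splat
+; so the remainder iterations run masked inside the vector loop and no scalar
+; epilogue is emitted; this is the 'icmp ule' that appears in the checks below.)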
+ +; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard, +; int n) { +; for(ix=0; ix < n; ++ix) { +; if (ix > guard) { +; char t = p[2*ix]; +; q[ix] = t; +; } +; } +; } + +; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc( +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}} +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; DISABLED_MASKED_STRIDED-NOT: for.body: +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void + + +; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc( +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED-NOT: for.body: +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void + +define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.010, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.010, 1 + %arrayidx = getelementptr inbounds i8, i8* 
%p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
 ; Same, but the load/store are not predicated. The interleave-group is
 ; invalidated here as well because we have gaps and we can't create an epilog.
 ; The access is thus scalarized.
@@ -185,6 +285,86 @@
 }
+
+; Unconditional accesses with gaps under Optsize scenario again, with unknown
+; trip-count this time, in order to check the behavior of folding-the-tail
+; (folding the remainder loop into the main loop using masking) together with
+; interleaved-groups.
+; The interleave-groups will be invalidated during cost-model checks, because
+; we don't have a way to support interleave-groups with gaps that require an
+; epilogue using masking (even when interleaved-masking is enabled; this
+; is not yet supported).
+; So we check for no epilogue and for scalarized conditional accesses.
+
+; for(ix=0; ix < n; ++ix) {
+;   char t = p[2*ix];
+;   q[ix] = t;
+; }
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED: vector.body:
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED: pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT: for.body:
+; DISABLED_MASKED_STRIDED: for.end:
+; DISABLED_MASKED_STRIDED-NEXT: ret void
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; 
ENABLED_MASKED_STRIDED-NOT: for.body: +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void + +define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = shl nuw nsw i32 %ix.07, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07 + store i8 %0, i8* %arrayidx1, align 1 + %inc = add nuw nsw i32 %ix.07, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + ; Check also a scenario with full interleave-groups (no gaps) as well as both ; load and store groups. We check that when masked-interleave-group is disabled ; the predicated loads (and stores) are not vectorized as an @@ -272,3 +452,210 @@ for.end: ret void } + +; Full groups again, this time checking an Optsize scenario, with unknown trip- +; count, to check the behavior of folding-the-tail (folding the remainder loop +; into the main loop using masking) together with interleaved-groups. +; When masked-interleave-group is disabled the interleave-groups will be +; invalidated during Legality check, so nothing to check here. +; When masked-interleave-group is enabled we check that there is no epilogue, +; and that the interleave-groups are vectorized using proper masking (with +; shuffling of the mask feeding the wide masked load/store). +; The mask itself is an And of two masks: one that masks away the remainder +; iterations, and one that masks away the 'else' of the 'if' statement. 
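+;
+; (Illustrative sketch, not taken from the test output; names are hypothetical.
+; Per the checks below, the combined mask is formed roughly as:
+;   %if.mask   = icmp sgt <8 x i32> %vec.ind, %guard.splat
+;   %tail.mask = icmp ule <8 x i32> %vec.ind, %btc.splat
+;   %mask      = and <8 x i1> %if.mask, %tail.mask
+; and %mask is then shuffled (each lane's bit replicated per group member) into
+; the <16 x i1> mask that feeds the wide masked load and masked store.)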
+
+;
+; void masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                                 unsigned char* restrict q,
+;                                 int guard,
+;                                 int n) {
+; for(ix=0; ix < n; ++ix) {
+;   if (ix > guard) {
+;     char left = p[2*ix];
+;     char right = p[2*ix + 1];
+;     char max = max(left, right);
+;     q[2*ix] = max;
+;     q[2*ix+1] = 0 - max;
+;   }
+; }
+;}
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32>
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i32 [[TMP1]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32>
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add <8 x i32> {{.*}},
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]],
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+
+define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp22 = icmp sgt i32 %n, 0
+  br i1 %cmp22, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %cmp1 = icmp sgt i32 %ix.023, %guard
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.023, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx3 = 
getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx3, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx5, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx9, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.023, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; Full groups under Optsize scenario again, with unknown trip-count, again in
+; order to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; This time the accesses are not conditional; they become conditional only
+; due to tail folding.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during cost-model checks, so we check for no epilogue and
+; scalarized conditional accesses.
+; When masked-interleave-group is enabled we check for no epilogue,
+; and for interleave-groups vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; (Same vectorization scheme as for the previous loop with conditional
+; accesses, except here the mask only masks away the remainder iterations.)
+;
+; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                                               unsigned char* restrict q,
+;                                               int n) {
+; for(ix=0; ix < n; ++ix) {
+;   char left = p[2*ix];
+;   char right = p[2*ix + 1];
+;   char max = max(left, right);
+;   q[2*ix] = max;
+;   q[2*ix+1] = 0 - max;
+; }
+;}
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; DISABLED_MASKED_STRIDED: vector.body:
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED: pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT: for.body:
+; DISABLED_MASKED_STRIDED: for.end:
+; DISABLED_MASKED_STRIDED-NEXT: ret void
+
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]],
+; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP0]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]] +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void + +define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize { +entry: + %cmp20 = icmp sgt i32 %n, 0 + br i1 %cmp20, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = shl nuw nsw i32 %ix.021, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx2, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx4, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx8, align 1 + %inc = add nuw nsw i32 %ix.021, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} +