diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5028,12 +5028,6 @@
                                              Alignment, AddressSpace, CostKind,
                                              UseMaskForCond, UseMaskForGaps);
 
-  // We currently Support only fully-interleaved groups, with no gaps.
-  // TODO: Support also strided loads (interleaved-groups with gaps).
-  if (Indices.size() && Indices.size() != Factor)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, CostKind);
-
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
   // VecTy = <12 x i32>.
@@ -5249,12 +5243,16 @@
   };
 
   if (Opcode == Instruction::Load) {
+    // FIXME: If we have a partially-interleaved group, with gaps,
+    // should we discount the not-demanded indices?
    if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                            ETy.getSimpleVT()))
      return MemOpCosts + Entry->Cost;
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
+    assert((!Indices.size() || Indices.size() == Factor) &&
+           "Interleaved store only supports fully-interleaved groups.");
    if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                            ETy.getSimpleVT()))
      return MemOpCosts + Entry->Cost;
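For context, here is a minimal IR sketch of the kind of group the removed early-exit used to send to the base-class fallback: a factor-3 interleaved load where only indices 0 and 1 are demanded and index 2 is a gap. Function and value names are hypothetical, not from the patch; the cost-model tests below exercise exactly these shapes.

```llvm
; Hypothetical reduced example: VF=4, factor 3, demanded indices {0,1}.
; One wide load covers all groups; one strided shuffle per demanded index.
; Element 2 of each group is loaded but never extracted (the "gap").
define void @stride3_indices_01u(<12 x i32>* %in, <4 x i32>* %out0, <4 x i32>* %out1) {
  %wide = load <12 x i32>, <12 x i32>* %in, align 4
  %v0 = shufflevector <12 x i32> %wide, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v1 = shufflevector <12 x i32> %wide, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  store <4 x i32> %v0, <4 x i32>* %out0, align 16
  store <4 x i32> %v1, <4 x i32>* %out1, align 16
  ret void
}
```

With the early-exit gone, this shape is priced via the AVX2InterleavedLoadTbl entry for factor 3 rather than the base-class scalarization estimate, which is where the large cost drops in the tests below come from.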
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 94 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 47 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 94 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 25 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
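The store path, by contrast, still insists on fully-interleaved groups; that is what the new assert in the first hunk documents. As a point of contrast with the gappy loads tested above, here is a minimal sketch of the allowed store shape, assuming a hypothetical factor-2 group (names and masks are illustrative, not from the patch):

```llvm
; Hypothetical fully-interleaved factor-2 store: both indices present, no gaps.
define void @store_stride2_full(<4 x i32> %a, <4 x i32> %b, <8 x i32>* %out) {
  ; Interleave the two operand vectors: a0,b0,a1,b1,a2,b2,a3,b3.
  %iv = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %iv, <8 x i32>* %out, align 4
  ret void
}
```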
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 32 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 22 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 52 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 26 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 52 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
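The pr48340.ll update below shows the end-to-end payoff: with the gap restriction lifted, the vectorizer now picks the interleaved-access path for these stride-2 pointer loads, replacing four masked gathers per iteration with wide consecutive loads plus even-lane shuffles. A minimal sketch of the per-group shape, assuming hypothetical names (the authoritative pattern is in the CHECK lines that follow):

```llvm
; Hypothetical reduction of the new vector.body pattern: one wide <8 x i64*>
; load covers four stride-2 groups; a single shuffle keeps the even lanes
; (index 0 of each group); the odd lanes are the unused gap.
define <4 x i64*> @gapped_stride2_load(<8 x i64*>* %p) {
  %wide = load <8 x i64*>, <8 x i64*>* %p, align 8
  %v0 = shufflevector <8 x i64*> %wide, <8 x i64*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i64*> %v0
}
```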
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize --force-vector-width=4 --force-vector-interleave=0 -S -o - < %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -8,11 +9,75 @@
 define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
-; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
+; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -16
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %0**
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %0**
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %0**
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %0**
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr %0*, %0** [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast %0** [[TMP19]] to <8 x %0*>*
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr %0*, %0** [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast %0** [[TMP21]] to <8 x %0*>*
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr %0*, %0** [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast %0** [[TMP23]] to <8 x %0*>*
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr %0*, %0** [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast %0** [[TMP25]] to <8 x %0*>*
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP20]], align 8
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP22]], align 8
+; CHECK-NEXT: [[WIDE_VEC9:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP24]], align 8
+; CHECK-NEXT: [[WIDE_VEC10:%.*]] = load <8 x %0*>, <8 x %0*>* [[TMP26]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC8]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC9]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <8 x %0*> [[WIDE_VEC10]], <8 x %0*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = bitcast i64* [[P2]] to %0**
+; CHECK-NEXT: [[V:%.*]] = load %0*, %0** [[P3]], align 8
+; CHECK-NEXT: [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
+; CHECK-NEXT: br i1 [[B]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
 entry:
   br label %loop
@@ -30,11 +95,75 @@
 define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @bar(
-; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
+; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -16
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %1**
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %1**
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %1**
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %1**
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr %1*, %1** [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast %1** [[TMP19]] to <8 x %1*>*
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr %1*, %1** [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast %1** [[TMP21]] to <8 x %1*>*
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr %1*, %1** [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast %1** [[TMP23]] to <8 x %1*>*
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr %1*, %1** [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast %1** [[TMP25]] to <8 x %1*>*
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP20]], align 8
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP22]], align 8
+; CHECK-NEXT: [[WIDE_VEC9:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP24]], align 8
+; CHECK-NEXT: [[WIDE_VEC10:%.*]] = load <8 x %1*>, <8 x %1*>* [[TMP26]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC8]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC9]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <8 x %1*> [[WIDE_VEC10]], <8 x %1*> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = bitcast i64* [[P2]] to %1**
+; CHECK-NEXT: [[V:%.*]] = load %1*, %1** [[P3]], align 8
+; CHECK-NEXT: [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
+; CHECK-NEXT: br i1 [[B]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
 entry:
   br label %loop