diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5205,6 +5205,7 @@
         Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
 
   // Get the cost of all the memory operations.
+  // FIXME: discount dead loads.
   InstructionCost MemOpCosts = getMemoryOpCost(
       Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
 
@@ -5424,22 +5425,25 @@
   };
 
   if (Opcode == Instruction::Load) {
-    // FIXME: if we have a partially-interleaved groups, with gaps,
-    // should we discount the not-demanded indicies?
+    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
+                              MemOpCosts](const CostTblEntry *Entry) {
+      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
+    };
+
     if (ST->hasAVX2())
       if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                               ETy.getSimpleVT()))
-        return MemOpCosts + Entry->Cost;
+        return GetDiscountedCost(Entry);
 
     if (ST->hasSSSE3())
       if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                               ETy.getSimpleVT()))
-        return MemOpCosts + Entry->Cost;
+        return GetDiscountedCost(Entry);
 
     if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                               ETy.getSimpleVT()))
-        return MemOpCosts + Entry->Cost;
+        return GetDiscountedCost(Entry);
   } else {
     assert(Opcode == Instruction::Store &&
            "Expected Store Instruction at this point");
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
@@ -13,24 +13,24 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; SSE2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 188 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 100 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 11 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 23 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 67 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 192 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -26,11 +26,11 @@
 ; AVX1: LV: Found an estimated cost of 104 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 33 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -27,24 +27,42 @@
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64*> [[TMP9]] to <4 x %0**>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64*> [[TMP10]] to <4 x %0**>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11]], i32 8, <4 x i1> , <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12]], i32 8, <4 x i1> , <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13]], i32 8, <4 x i1> , <4 x %0*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14]], i32 8, <4 x i1> , <4 x %0*> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %0**
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %0**
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %0**
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %0**
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr %0*, %0** [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast %0** [[TMP19]] to <16 x %0*>*
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr %0*, %0** [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast %0** [[TMP21]] to <16 x %0*>*
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr %0*, %0** [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast %0** [[TMP23]] to <16 x %0*>*
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr %0*, %0** [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast %0** [[TMP25]] to <16 x %0*>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP20]], align 8
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP22]], align 8
+; CHECK-NEXT:    [[WIDE_VEC9:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP24]], align 8
+; CHECK-NEXT:    [[WIDE_VEC10:%.*]] = load <16 x %0*>, <16 x %0*>* [[TMP26]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC]], <16 x %0*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC8]], <16 x %0*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC12:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC9]], <16 x %0*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC13:%.*]] = shufflevector <16 x %0*> [[WIDE_VEC10]], <16 x %0*> poison, <4 x i32>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 64
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -95,24 +113,42 @@
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP6]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64*> [[TMP9]] to <4 x %1**>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64*> [[TMP10]] to <4 x %1**>
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11]], i32 8, <4 x i1> , <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12]], i32 8, <4 x i1> , <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13]], i32 8, <4 x i1> , <4 x %1*> undef)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14]], i32 8, <4 x i1> , <4 x %1*> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[NEXT_GEP]] to %1**
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[NEXT_GEP5]] to %1**
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[NEXT_GEP6]] to %1**
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64* [[NEXT_GEP7]] to %1**
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr %1*, %1** [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast %1** [[TMP19]] to <16 x %1*>*
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr %1*, %1** [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast %1** [[TMP21]] to <16 x %1*>*
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr %1*, %1** [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast %1** [[TMP23]] to <16 x %1*>*
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr %1*, %1** [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast %1** [[TMP25]] to <16 x %1*>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP20]], align 8
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP22]], align 8
+; CHECK-NEXT:    [[WIDE_VEC9:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP24]], align 8
+; CHECK-NEXT:    [[WIDE_VEC10:%.*]] = load <16 x %1*>, <16 x %1*>* [[TMP26]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC]], <16 x %1*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC8]], <16 x %1*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC12:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC9]], <16 x %1*> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC13:%.*]] = shufflevector <16 x %1*> [[WIDE_VEC10]], <16 x %1*> poison, <4 x i32>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 64
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
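Note for reviewers: the updated test expectations all follow the new formula MemOpCosts + ceil(NumMembers * Entry->Cost / Factor), i.e. the shuffle cost from the per-ISA tables is scaled by the fraction of interleave-group members actually demanded, while the memory cost is still charged in full. Below is a minimal standalone sketch of that arithmetic, checked against the VF-4 stride-3 deltas above. The local divideCeil mirrors llvm::divideCeil from llvm/Support/MathExtras.h, and the memory cost of 2 and table cost of 3 are illustrative values inferred from those two tests, not quoted from the real X86 cost tables.

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::divideCeil (llvm/Support/MathExtras.h).
static uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

// New model: charge the interleave-shuffle cost only for the demanded
// members of the group; the memory cost is unchanged. MemOpCost and
// EntryCost are hypothetical inputs, not real cost-table entries.
static uint64_t discountedLoadCost(uint64_t MemOpCost, uint64_t EntryCost,
                                   uint64_t Factor, uint64_t NumMembers) {
  return MemOpCost + divideCeil(NumMembers * EntryCost, Factor);
}

int main() {
  // Stride-3 group at VF 4 with only index 0 demanded (indices-0uu): the
  // old model charged MemOpCost + EntryCost = 2 + 3 = 5; the new model
  // charges 2 + ceil(1 * 3 / 3) = 3, matching the AVX2 VF-4 change above.
  assert(discountedLoadCost(2, 3, 3, 1) == 3);
  // Same group with indices 0 and 1 demanded (indices-01u):
  // 2 + ceil(2 * 3 / 3) = 4, down from 5.
  assert(discountedLoadCost(2, 3, 3, 2) == 4);
  return 0;
}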