diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5165,12 +5165,6 @@
   // shuffles. We therefore use a lookup table instead, filled according to
   // the instruction sequences that codegen currently generates.
 
-  // We currently support only fully-interleaved groups, with no gaps.
-  // TODO: Support also strided loads (interleaved-groups with gaps).
-  if (Indices.size() && Indices.size() != Factor)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, CostKind);
-
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
   // VecTy = <12 x i32>.
@@ -5386,6 +5380,8 @@
   };
 
   if (Opcode == Instruction::Load) {
+    // FIXME: if we have partially-interleaved groups, with gaps,
+    // should we discount the not-demanded indices?
     if (ST->hasAVX2())
       if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                               ETy.getSimpleVT()))
@@ -5393,6 +5389,8 @@
   } else {
     assert(Opcode == Instruction::Store &&
            "Expected Store Instruction at this point");
+    assert((!Indices.size() || Indices.size() == Factor) &&
+           "Interleaved store only supports fully-interleaved groups.");
     if (ST->hasAVX2())
       if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                               ETy.getSimpleVT()))
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 94 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 47 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 94 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 25 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 32 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 22 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -24,10 +24,10 @@
 ; AVX1: LV: Found an estimated cost of 52 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 26 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 52 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize --force-vector-width=4 --force-vector-interleave=0 -S -o - < %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,16 +10,17 @@
 define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
 ; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+;
 entry:
   br label %loop
 
 loop:
   %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
-  %p.inc = getelementptr inbounds i64, i64* %p2, i64 2
+  %p.inc = getelementptr inbounds i64, i64* %p2, i64 4
   %p3 = bitcast i64* %p2 to %0**
   %v = load %0*, %0** %p3, align 8
   %b = icmp eq i64* %p.inc, %p.last
@@ -31,16 +33,17 @@
 define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @bar(
 ; CHECK: vector.body:
-; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+;
 entry:
   br label %loop
 
 loop:
   %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
-  %p.inc = getelementptr inbounds i64, i64* %p2, i64 2
+  %p.inc = getelementptr inbounds i64, i64* %p2, i64 4
   %p3 = bitcast i64* %p2 to %1**
   %v = load %1*, %1** %p3, align 8
   %b = icmp eq i64* %p.inc, %p.last
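
For reference, below is a minimal IR sketch (illustrative only; the function and value names are not taken from the patch) of the kind of loop the stride-2 "indices 0u" cost-model test above describes: a stride-2 i32 access where only interleave index 0 of each {A[2*i], A[2*i+1]} pair is demanded, i.e. an interleaved load group with a gap. With the early bail-out removed, the AVX2 lookup table now prices such a group directly instead of deferring to the generic BaseT::getInterleavedMemoryOpCost fallback.

; Hypothetical example loop, not part of the patch or its test files.
define void @stride2_index0_only(i32* %in, i32* %out, i64 %n) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %idx = shl nuw nsw i64 %i, 1                          ; element 2*i: stride-2 access
  %in0 = getelementptr inbounds i32, i32* %in, i64 %idx
  %v0 = load i32, i32* %in0, align 4                    ; only interleave index 0 is demanded
  %out.i = getelementptr inbounds i32, i32* %out, i64 %i
  store i32 %v0, i32* %out.i, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cmp = icmp ult i64 %i.next, %n
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}

When the loop vectorizer forms an interleave group for %v0 here, Indices contains only {0} while Factor is 2, which is exactly the Indices.size() != Factor case that previously hit the removed early return.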