diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1237,6 +1237,14 @@ // Then plus the cost of interleave operation. assert(Indices.size() <= Factor && "Interleaved memory op has too many members"); + + APInt DemandedLoadStoreElts = APInt::getNullValue(NumElts); + for (unsigned Index : Indices) { + assert(Index < Factor && "Invalid index for interleaved memory op"); + for (unsigned Elm = 0; Elm < NumSubElts; Elm++) + DemandedLoadStoreElts.setBit(Index + Elm * Factor); + } + if (Opcode == Instruction::Load) { // The interleave cost is similar to extract sub vectors' elements // from the wide vector, and insert them into sub vectors. @@ -1246,21 +1254,12 @@ // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 // The cost is estimated as extract elements at 0, 2, 4, 6 from the // <8 x i32> vector and insert them into a <4 x i32> vector. - for (unsigned Index : Indices) { - assert(Index < Factor && "Invalid index for interleaved memory op"); - - // Extract elements from loaded vector for each sub vector. - for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT, - Index + Elm * Factor); - } - - InstructionCost InsSubCost = 0; - for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - InsSubCost += - thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm); - + InstructionCost InsSubCost = + getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false); Cost += Indices.size() * InsSubCost; + Cost += + thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, + /*Insert*/ false, /*Extract*/ true); } else { // The interleave cost is extract elements from sub vectors, and // insert them into the wide vector. 
@@ -1275,20 +1274,12 @@ // The cost is estimated as extract all elements (of actual members, // excluding gaps) from both <4 x i32> vectors and insert into the <12 x // i32> vector. - InstructionCost ExtSubCost = 0; - for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement, - SubVT, Elm); + InstructionCost ExtSubCost = + getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); Cost += ExtSubCost * Indices.size(); - - for (unsigned Index : Indices) { - assert(Index < Factor && "Invalid index for interleaved memory op"); - - // Insert elements from loaded vector for each sub vector. - for (unsigned Elm = 0; Elm < NumSubElts; Elm++) - Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT, - Index + Elm * Factor); - } + Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, + /*Insert*/ true, + /*Extract*/ false); } if (!UseMaskForCond) @@ -1308,13 +1299,9 @@ // The cost is estimated as extract all mask elements from the <8xi1> mask // vector and insert them factor times into the <24xi1> shuffled mask // vector. - for (unsigned i = 0; i < NumSubElts; i++) - Cost += - thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); - - for (unsigned i = 0; i < NumElts; i++) - Cost += - thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i); + Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); + Cost += + getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false); // The Gaps mask is invariant and created outside the loop, therefore the // cost of creating it is not accounted for here. 
However if we have both diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll @@ -22,8 +22,8 @@ ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll @@ -22,15 +22,15 @@ ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 -; 
AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll @@ -22,8 +22,8 @@ ; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 41 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 82 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX1: LV: Found 
an estimated cost of 228 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll @@ -21,13 +21,13 @@ ; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX1: LV: Found an estimated cost of 285 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX2: LV: Found an estimated cost of 285 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX2: LV: Found an estimated cost 
of 215 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll @@ -21,7 +21,7 @@ ; AVX1: LV: Found an estimated cost of 31 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX1: LV: Found an estimated cost of 123 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2 -; AVX1: LV: Found an estimated cost of 342 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 +; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2 ; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll @@ -13,24 +13,24 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an 
estimated cost of 160 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1 -; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1 +; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX1: LV: Found an estimated cost of 81 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1 -; AVX1: LV: Found an estimated cost of 226 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1 +; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX2: LV: Found an estimated cost of 33 for VF 8 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX2: LV: Found an estimated cost of 81 for VF 16 For instruction: %v0 = load i8, i8* %in0, align 1 -; AVX2: LV: Found an estimated cost of 226 for VF 32 For instruction: %v0 = load i8, i8* %in0, align 1 +; AVX2: LV: Found an estimated cost of 166 for VF 32 For instruction: %v0 = 
load i8, i8* %in0, align 1 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, i8* %in0, align 1 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, i8* %in0, align 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll @@ -21,9 +21,9 @@ ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, i16* %out1, align 2 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: store i16 %v1, i16* %out1, align 2 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %v1, i16* %out1, align 2 -; AVX1: LV: Found an estimated cost of 49 for VF 8 For instruction: store i16 %v1, i16* %out1, align 2 -; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: store i16 %v1, i16* %out1, align 2 -; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: store i16 %v1, i16* %out1, align 2 +; AVX1: LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %v1, i16* %out1, align 2 +; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction: store i16 %v1, i16* %out1, align 2 +; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction: store i16 %v1, i16* %out1, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, i16* %out1, align 2 ; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, i16* %out1, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll @@ -20,17 
+20,17 @@ ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2 +; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2 +; AVX1: LV: Found an estimated cost of 53 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2 +; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2 +; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX2: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2 -; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2 +; AVX2: LV: Found an estimated cost of 30 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2 +; AVX2: LV: Found an estimated cost of 53 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2 +; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2 
+; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction: store i16 %v2, i16* %out2, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll @@ -20,10 +20,10 @@ ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, i16* %out3, align 2 ; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: store i16 %v3, i16* %out3, align 2 -; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: store i16 %v3, i16* %out3, align 2 -; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction: store i16 %v3, i16* %out3, align 2 -; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %v3, i16* %out3, align 2 -; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction: store i16 %v3, i16* %out3, align 2 +; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v3, i16* %out3, align 2 +; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction: store i16 %v3, i16* %out3, align 2 +; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction: store i16 %v3, i16* %out3, align 2 +; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction: store i16 %v3, i16* %out3, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, i16* %out3, align 2 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v3, i16* %out3, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll 
b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll @@ -18,16 +18,16 @@ ; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX1: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX1: LV: Found an estimated cost of 115 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX1: LV: Found an estimated cost of 285 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2 +; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2 +; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2 +; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2 +; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX2: LV: Found an estimated cost of 58 for VF 4 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX2: LV: Found an estimated cost of 115 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2 -; AVX2: LV: Found an estimated cost of 285 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2 +; AVX2: LV: Found an estimated cost of 27 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2 +; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: store i16 %v4, i16* 
%out4, align 2 +; AVX2: LV: Found an estimated cost of 88 for VF 8 For instruction: store i16 %v4, i16* %out4, align 2 +; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction: store i16 %v4, i16* %out4, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, i16* %out4, align 2 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: store i16 %v4, i16* %out4, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll @@ -18,10 +18,10 @@ ; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction: store i16 %v5, i16* %out5, align 2 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, i16* %out5, align 2 -; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v5, i16* %out5, align 2 -; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: store i16 %v5, i16* %out5, align 2 -; AVX1: LV: Found an estimated cost of 147 for VF 8 For instruction: store i16 %v5, i16* %out5, align 2 -; AVX1: LV: Found an estimated cost of 342 for VF 16 For instruction: store i16 %v5, i16* %out5, align 2 +; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction: store i16 %v5, i16* %out5, align 2 +; AVX1: LV: Found an estimated cost of 53 for VF 4 For instruction: store i16 %v5, i16* %out5, align 2 +; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction: store i16 %v5, i16* %out5, align 2 +; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction: store i16 %v5, i16* %out5, align 2 ; ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, i16* %out5, align 2 ; AVX2: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, i16* %out5, 
align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll @@ -13,24 +13,24 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 27 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 79 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 238 for VF 8 For instruction: store i8 %v1, i8* %out1, align 1 -; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: store i8 %v1, i8* %out1, align 1 +; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction: store i8 %v1, i8* %out1, align 1 -; AVX1: LV: Found an estimated cost of 97 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 -; AVX1: LV: Found an estimated cost of 226 for VF 32 For instruction: store i8 %v1, i8* %out1, align 1 +; AVX1: LV: Found an estimated cost of 67 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 +; 
AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX2: LV: Found an estimated cost of 33 for VF 8 For instruction: store i8 %v1, i8* %out1, align 1 -; AVX2: LV: Found an estimated cost of 97 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 -; AVX2: LV: Found an estimated cost of 226 for VF 32 For instruction: store i8 %v1, i8* %out1, align 1 +; AVX2: LV: Found an estimated cost of 67 for VF 16 For instruction: store i8 %v1, i8* %out1, align 1 +; AVX2: LV: Found an estimated cost of 166 for VF 32 For instruction: store i8 %v1, i8* %out1, align 1 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, i8* %out1, align 1 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %v1, i8* %out1, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=SSE -; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX -; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX1 +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize 
-instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX2 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SSE ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SSE @@ -26,34 +26,63 @@ ; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 ; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; -; AVX-LABEL: @foo( -; AVX-NEXT: entry: -; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AVX: vector.ph: -; AVX-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX: vector.body: -; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; AVX-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>* -; AVX-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 -; AVX-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> -; AVX-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> -; AVX-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; AVX-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4 -; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; AVX-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; AVX-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] -; AVX: middle.block: -; AVX-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; AVX: scalar.ph: -; AVX-NEXT: br label [[FOR_BODY:%.*]] -; AVX: for.cond.cleanup: -; AVX-NEXT: ret void -; AVX: for.body: -;
AVX-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; AVX1-LABEL: @foo( +; AVX1-NEXT: entry: +; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX1: vector.ph: +; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; AVX1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +; AVX1-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4 +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; AVX1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX1: middle.block: +; AVX1-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; AVX1: scalar.ph: +; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: for.cond.cleanup: +; AVX1-NEXT: ret void +; AVX1: for.body: +; AVX1-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; AVX2-LABEL: @foo( +; AVX2-NEXT: entry: +; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX2: vector.ph: +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0,
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <16 x i32>* +; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP2]], align 4 +; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; AVX2-NEXT: [[TMP3:%.*]] = add nsw <8 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* [[TMP5]], align 4 +; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; AVX2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX2: middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; AVX2: scalar.ph: +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.cond.cleanup: +; AVX2-NEXT: ret void +; AVX2: for.body: +; AVX2-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; entry: br label %for.body