Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2617,8 +2617,8 @@ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 - { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8 - { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8 + { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8 + { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8 { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 @@ -2631,14 +2631,14 @@ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) - { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store) - { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store) + { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store) + { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store) { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) - { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store) - { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store) - { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store) + { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store) + { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store) + { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store) }; if (Opcode == Instruction::Load) { @@ -2657,6 +2657,23 @@ Alignment, AddressSpace); } +static const CostTblEntry 
AVX512InterleavedLoadTbl[] = { + {3, MVT::v16i8, 13}, //(load 48i8 and) deinterleave into 3 x 16i8 + {3, MVT::v32i8, 16}, //(load 96i8 and) deinterleave into 3 x 32i8 + {3, MVT::v64i8, 25}, //(load 192i8 and) deinterleave into 3 x 64i8 +}; + +static const CostTblEntry AVX512InterleavedStoreTbl[] = { + {3, MVT::v16i8, 13}, // interleave 3 x 16i8 into 48i8 (and store) + {3, MVT::v32i8, 16}, // interleave 3 x 32i8 into 96i8 (and store) + {3, MVT::v64i8, 29}, // interleave 3 x 64i8 into 192i8 (and store) + + {4, MVT::v8i8, 11}, // interleave 4 x 8i8 into 32i8 (and store) + {4, MVT::v16i8, 12}, // interleave 4 x 16i8 into 64i8 (and store) + {4, MVT::v32i8, 16}, // interleave 4 x 32i8 into 128i8 (and store) + {4, MVT::v64i8, 28} // interleave 4 x 64i8 into 256i8 (and store) +}; + // Get estimation for interleaved load/store operations and strided load. // \p Indices contains indices for strided load. // \p Factor - the factor of interleaving. @@ -2684,7 +2701,13 @@ unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + unsigned VF = VecTy->getVectorNumElements() / Factor; + MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); + if (Opcode == Instruction::Load) { + if (const auto *Entry = + CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) + return Entry->Cost; // Kind of shuffle depends on number of loaded values. // If we load the entire data in one register, we can use a 1-src shuffle. // Otherwise, we'll merge 2 sources in each operation. @@ -2728,6 +2751,10 @@ assert(Opcode == Instruction::Store && "Expected Store Instruction at this point"); + if (const auto *Entry = + CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) + return Entry->Cost; + // There is no strided stores meanwhile. And store can't be folded in // shuffle. unsigned NumOfSources = Factor; // The number of values to be merged. 
Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll =================================================================== --- test/Analysis/CostModel/X86/interleaved-load-i8.ll +++ test/Analysis/CostModel/X86/interleaved-load-i8.ll @@ -10,8 +10,8 @@ ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: %0 = load i8 entry: %cmp13 = icmp sgt i32 %Nels, 0 br i1 %cmp13, label %for.body.preheader, label %for.end Index: test/Analysis/CostModel/X86/interleaved-store-i8.ll =================================================================== --- test/Analysis/CostModel/X86/interleaved-store-i8.ll +++ test/Analysis/CostModel/X86/interleaved-store-i8.ll @@ -10,8 +10,8 @@ ;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4 ;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4 ;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv4 entry: %cmp14 = icmp sgt i32 %Nels, 0 br i1 %cmp14, label %for.body.lr.ph, label %for.end @@ -47,9 +47,9 @@ ;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7 ;CHECK: LV: Found an estimated cost 
of 13 for VF 2 For instruction: store i8 %conv7 ;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv7 entry: %cmp19 = icmp sgt i32 %Nels, 0 br i1 %cmp19, label %for.body.lr.ph, label %for.end Index: test/Analysis/CostModel/X86/strided-load-i8.ll =================================================================== --- test/Analysis/CostModel/X86/strided-load-i8.ll +++ test/Analysis/CostModel/X86/strided-load-i8.ll @@ -41,9 +41,9 @@ ;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load ;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load ;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load -;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load +;CHECK: Found an estimated cost of 13 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 16 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 25 for VF 64 For instruction: %1 = load entry: br label %for.body