Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h @@ -93,6 +93,9 @@ int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef<unsigned> Indices, + unsigned Alignment, unsigned AddressSpace); int getIntImmCost(int64_t); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2245,6 +2245,114 @@ return !(ST->isAtom()); } +// Get estimation for interleaved load/store operations for AVX2. +// \p Factor is the interleaved-access factor (stride) - number of +// (interleaved) elements in the group. +// \p Indices contains the indices for a strided load: when the +// interleaved load has gaps they indicate which elements are used. +// If Indices is empty (or if the number of indices is equal to the size +// of the interleaved-access as given in \p Factor) the access has no gaps. +// +// As opposed to AVX-512, AVX2 does not have generic shuffles that allow +// computing the cost using a generic formula as a function of generic +// shuffles. We therefore use a lookup table instead, filled according to +// the instruction sequences that codegen currently generates. +int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + + // We currently support only fully-interleaved groups, with no gaps. + // TODO: Support also strided loads (interleaved-groups with gaps). 
+ if (Indices.size() && Indices.size() != Factor) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + + // VecTy for interleave memop is <VF*Factor x Elt>. + // So, for VF=4, Interleave Factor = 3, Element type = i32 we have + // VecTy = <12 x i32>. + MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + + // This function can be called with VecTy=<6xi128>, Factor=3, in which case + // the VF=2, while v2i128 is an unsupported MVT vector type + // (see MachineValueType.h::getVectorVT()). + if (!LegalVT.isVector()) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + + unsigned VF = VecTy->getVectorNumElements() / Factor; + Type *ScalarTy = VecTy->getVectorElementType(); + + // Calculate the number of memory operations (NumOfMemOps), required + // for load/store the VecTy. + unsigned VecTySize = DL.getTypeStoreSize(VecTy); + unsigned LegalVTSize = LegalVT.getStoreSize(); + unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; + + // Get the cost of one memory operation. + Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), + LegalVT.getVectorNumElements()); + unsigned MemOpCost = + getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + + VectorType *VT = VectorType::get(ScalarTy, VF); + EVT ETy = TLI->getValueType(DL, VT); + if (!ETy.isSimple()) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + + // TODO: Complete for other data-types and strides. + // Each combination of Stride, ElementTy and VF results in a different + // sequence; The cost tables are therefore accessed with: + // Factor (stride) and VectorType=VFxElemType. + // The Cost accounts only for the shuffle sequence; + // The cost of the loads/stores is accounted for separately. 
+ // + static const CostTblEntry AVX2InterleavedLoadTbl[] = { + { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 + { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 + { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 + { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8 + { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8 + + { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 + { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 + { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 + { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 + { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8 + }; + + static const CostTblEntry AVX2InterleavedStoreTbl[] = { + { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) + { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) + { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) + { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store) + { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store) + + { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) + { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) + { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store) + { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store) + { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store) + }; + + if (Opcode == Instruction::Load) { + if (const auto *Entry = + CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } else { + assert(Opcode == Instruction::Store && + "Expected Store Instruction at this point"); + if (const auto *Entry = + CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 
+ Alignment, AddressSpace); +} + // Get estimation for interleaved load/store operations and strided load. // \p Indices contains indices for strided load. // \p Factor - the factor of interleaving. @@ -2353,6 +2461,10 @@ if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); + if (ST->hasAVX2()) + return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } Index: llvm/trunk/test/Analysis/CostModel/X86/interleaved-load-i8.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/interleaved-load-i8.ll +++ llvm/trunk/test/Analysis/CostModel/X86/interleaved-load-i8.ll @@ -0,0 +1,98 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: norecurse nounwind readonly uwtable +define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) { +;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction: %0 = load i8 +entry: + %cmp13 = icmp sgt i32 %Nels, 0 + br i1 %cmp13, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %Ptr.addr.016 = phi i8* [ %incdec.ptr2, %for.body ], [ %Ptr, 
%for.body.preheader ] + %i.015 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %s.014 = phi i32 [ %add6, %for.body ], [ 0, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 1 + %0 = load i8, i8* %Ptr.addr.016, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 2 + %1 = load i8, i8* %incdec.ptr, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 3 + %2 = load i8, i8* %incdec.ptr1, align 1 + %conv = zext i8 %0 to i32 + %conv3 = zext i8 %1 to i32 + %conv4 = zext i8 %2 to i32 + %add = add i32 %s.014, %conv + %add5 = add i32 %add, %conv3 + %add6 = add i32 %add5, %conv4 + %inc = add nuw nsw i32 %i.015, 1 + %exitcond = icmp eq i32 %inc, %Nels + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %add6.lcssa = phi i32 [ %add6, %for.body ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %add6.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} + +; Function Attrs: norecurse nounwind readonly uwtable +define i32 @doit_stride4(i8* nocapture readonly %Ptr, i32 %Nels) local_unnamed_addr { +;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 21 for VF 8 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 41 for VF 16 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 84 for VF 32 For instruction: %0 = load i8 +entry: + %cmp59 = icmp sgt i32 %Nels, 0 + br i1 %cmp59, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %Ptr.addr.062 = phi i8* [ %incdec.ptr3, %for.body ], [ %Ptr, %for.body.preheader ] + %i.061 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %s.060 = phi 
i32 [ %cond39, %for.body ], [ 0, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 1 + %0 = load i8, i8* %Ptr.addr.062, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 2 + %1 = load i8, i8* %incdec.ptr, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 3 + %2 = load i8, i8* %incdec.ptr1, align 1 + %incdec.ptr3 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 4 + %3 = load i8, i8* %incdec.ptr2, align 1 + %cmp5 = icmp ult i8 %0, %1 + %.sink = select i1 %cmp5, i8 %0, i8 %1 + %cmp12 = icmp ult i8 %.sink, %2 + %.sink40 = select i1 %cmp12, i8 %.sink, i8 %2 + %cmp23 = icmp ult i8 %.sink40, %3 + %.sink41 = select i1 %cmp23, i8 %.sink40, i8 %3 + %conv28 = zext i8 %.sink41 to i32 + %cmp33 = icmp slt i32 %s.060, %conv28 + %cond39 = select i1 %cmp33, i32 %s.060, i32 %conv28 + %inc = add nuw nsw i32 %i.061, 1 + %exitcond = icmp eq i32 %inc, %Nels + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %cond39.lcssa = phi i32 [ %cond39, %for.body ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %cond39.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} Index: llvm/trunk/test/Analysis/CostModel/X86/interleaved-store-i8.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/interleaved-store-i8.ll +++ llvm/trunk/test/Analysis/CostModel/X86/interleaved-store-i8.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: norecurse nounwind uwtable +define void @doit_stride3(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr { +;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 
8 for VF 2 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction: store i8 %conv4 +entry: + %cmp14 = icmp sgt i32 %Nels, 0 + br i1 %cmp14, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %conv = trunc i32 %Nels to i8 + %conv1 = shl i8 %conv, 1 + %conv4 = shl i8 %conv, 2 + br label %for.body + +for.body: + %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %Ptr.addr.015 = phi i8* [ %Ptr, %for.body.lr.ph ], [ %incdec.ptr5, %for.body ] + %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 1 + store i8 %conv, i8* %Ptr.addr.015, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 2 + store i8 %conv1, i8* %incdec.ptr, align 1 + %incdec.ptr5 = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 3 + store i8 %conv4, i8* %incdec.ptr2, align 1 + %inc = add nuw nsw i32 %i.016, 1 + %exitcond = icmp eq i32 %inc, %Nels + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Function Attrs: norecurse nounwind uwtable +define void @doit_stride4(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr { +;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction: store i8 %conv7 +entry: + %cmp19 = icmp sgt i32 %Nels, 0 + br i1 %cmp19, 
label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %conv = trunc i32 %Nels to i8 + %conv1 = shl i8 %conv, 1 + %conv4 = shl i8 %conv, 2 + %mul6 = mul nsw i32 %Nels, 5 + %conv7 = trunc i32 %mul6 to i8 + br label %for.body + +for.body: + %i.021 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %Ptr.addr.020 = phi i8* [ %Ptr, %for.body.lr.ph ], [ %incdec.ptr8, %for.body ] + %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 1 + store i8 %conv, i8* %Ptr.addr.020, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 2 + store i8 %conv1, i8* %incdec.ptr, align 1 + %incdec.ptr5 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 3 + store i8 %conv4, i8* %incdec.ptr2, align 1 + %incdec.ptr8 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 4 + store i8 %conv7, i8* %incdec.ptr5, align 1 + %inc = add nuw nsw i32 %i.021, 1 + %exitcond = icmp eq i32 %inc, %Nels + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +}