Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -467,7 +467,11 @@ SK_Reverse, ///< Reverse the order of the vector. SK_Alternate, ///< Choose alternate elements from vector. SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset. - SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset. + SK_ExtractSubvector,///< ExtractSubvector Index indicates start offset. + SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one + ///< with any shuffle mask. + SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any + ///< shuffle mask. }; /// \brief Additional information about an operand's possible values. Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h @@ -60,8 +60,9 @@ return Cost; } - /// Estimate the cost overhead of SK_Alternate shuffle. - unsigned getAltShuffleOverhead(Type *Ty) { + /// Estimate a cost of shuffle as a sequence of extract and insert + /// operations. 
+ unsigned getPermuteShuffleOverhead(Type *Ty) { assert(Ty->isVectorTy() && "Can only shuffle vectors"); unsigned Cost = 0; // Shuffle cost is equal to the cost of extracting element from its argument @@ -351,8 +352,9 @@ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Alternate) { - return getAltShuffleOverhead(Tp); + if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc || + Kind == TTI::SK_PermuteSingleSrc) { + return getPermuteShuffleOverhead(Tp); } return 1; } Index: llvm/trunk/lib/Analysis/CostModel.cpp =================================================================== --- llvm/trunk/lib/Analysis/CostModel.cpp +++ llvm/trunk/lib/Analysis/CostModel.cpp @@ -97,6 +97,27 @@ return true; } +static bool isSingleSourceVectorMask(ArrayRef Mask) { + bool Vec0 = false; + bool Vec1 = false; + for (unsigned i = 0, NumVecElts = Mask.size(); i < NumVecElts; ++i) { + if (Mask[i] >= 0) { + if ((unsigned)Mask[i] >= NumVecElts) + Vec1 = true; + else + Vec0 = true; + } + } + return !(Vec0 && Vec1); +} + +static bool isZeroEltBroadcastVectorMask(ArrayRef Mask) { + for (unsigned i = 0; i < Mask.size(); ++i) + if (Mask[i] > 0) + return false; + return true; +} + static bool isAlternateVectorMask(ArrayRef Mask) { bool isAlternate = true; unsigned MaskSize = Mask.size(); @@ -501,6 +522,17 @@ if (isAlternateVectorMask(Mask)) return TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTypOp0, 0, nullptr); + + if (isZeroEltBroadcastVectorMask(Mask)) + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, + VecTypOp0, 0, nullptr); + + if (isSingleSourceVectorMask(Mask)) + return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + VecTypOp0, 0, nullptr); + + return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + VecTypOp0, 0, nullptr); } return -1; Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- 
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h @@ -80,6 +80,13 @@ int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace); + int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace); + int getIntImmCost(int64_t); int getIntImmCost(const APInt &Imm, Type *Ty); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp @@ -598,9 +598,6 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - // We only estimate the cost of reverse and alternate shuffles. - if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); @@ -700,9 +697,8 @@ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; - } - if (Kind == TTI::SK_Alternate) { + } else if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); @@ -792,7 +788,132 @@ if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + + } else if (Kind == TTI::SK_PermuteTwoSrc) { + // We assume that source and destination have the same vector type. 
+ std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + int NumOfDests = LT.first; + int NumOfShufflesPerDest = LT.first * 2 - 1; + int NumOfShuffles = NumOfDests * NumOfShufflesPerDest; + + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b + }; + + if (ST->hasVBMI()) + if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return NumOfShuffles * Entry->Cost; + + static const CostTblEntry AVX512BWShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w + {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1 + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc + }; + + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return NumOfShuffles * Entry->Cost; + + static const CostTblEntry AVX512ShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd + {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps + {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q + {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d + {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd + {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps + {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q + {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d + }; + + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return NumOfShuffles * Entry->Cost; + + } else if (Kind 
== TTI::SK_PermuteSingleSrc) { + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + if (LT.first == 1) { + + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb + }; + + if (ST->hasVBMI()) + if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return Entry->Cost; + + static const CostTblEntry AVX512BWShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw + {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16 + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc + }; + + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return Entry->Cost; + + static const CostTblEntry AVX512ShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd + {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd + {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps + {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps + {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq + {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq + {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd + {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb + }; + + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return Entry->Cost; + + } else { + // We are going to permute multiple sources and the result will be in + // multiple destinations. Providing an accurate cost only for splits where + // the element type remains the same. 
+ + MVT LegalVT = LT.second; + if (LegalVT.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits() && + LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { + + unsigned VecTySize = DL.getTypeStoreSize(Tp); + unsigned LegalVTSize = LegalVT.getStoreSize(); + // Number of source vectors after legalization: + unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; + // Number of destination vectors after legalization: + unsigned NumOfDests = LT.first; + + Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), + LegalVT.getVectorNumElements()); + + unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; + return NumOfShuffles * + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); + } + } } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); @@ -1942,13 +2063,14 @@ // Vector-4 of gather/scatter instruction does not exist on KNL. // We can extend it to 8 elements, but zeroing upper bits of // the mask vector will add more instructions. Right now we give the scalar - // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is - // better in the VariableMask case. + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction + // is better in the VariableMask case. if (VF == 2 || (VF == 4 && !ST->hasVLX())) Scalarize = true; if (Scalarize) - return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, + AddressSpace); return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } @@ -2013,3 +2135,115 @@ // As a temporary solution, disable on Atom. return !(ST->isAtom() || ST->isSLM()); } + +// Get estimation for interleaved load/store operations and strided load. +// \p Indices contains indices for strided load. +// \p Factor - the factor of interleaving. +// AVX-512 provides 3-src shuffles that significantly reduces the cost. 
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + + // VecTy for interleave memop is . + // So, for VF=4, Interleave Factor = 3, Element type = i32 we have + // VecTy = <12 x i32>. + + // Calculate the number of memory operations (NumOfMemOps), required + // for load/store the VecTy. + MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + unsigned VecTySize = DL.getTypeStoreSize(VecTy); + unsigned LegalVTSize = LegalVT.getStoreSize(); + unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; + + // Get the cost of one memory operation. + Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), + LegalVT.getVectorNumElements()); + unsigned MemOpCost = + getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + + if (Opcode == Instruction::Load) { + // Kind of shuffle depends on number of loaded values. + // If we load the entire data in one register, we can use a 1-src shuffle. + // Otherwise, we'll merge 2 sources in each operation. + TTI::ShuffleKind ShuffleKind = + (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; + + unsigned ShuffleCost = + getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); + + unsigned NumOfLoadsInInterleaveGrp = + Indices.size() ? Indices.size() : Factor; + Type *ResultTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / Factor); + unsigned NumOfResults = + getTLI()->getTypeLegalizationCost(DL, ResultTy).first * + NumOfLoadsInInterleaveGrp; + + // About a half of the loads may be folded in shuffles when we have only + // one result. If we have more than one result, we do not fold loads at all. + unsigned NumOfUnfoldedLoads = + NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; + + // Get a number of shuffle operations per result. 
+ unsigned NumOfShufflesPerResult = + std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); + + // The SK_PermuteTwoSrc shuffle clobbers one of src operands. + // When we have more than one destination, we need additional instructions + // to keep sources. + unsigned NumOfMoves = 0; + if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) + NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; + + int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + + NumOfUnfoldedLoads * MemOpCost + NumOfMoves; + + return Cost; + } + + // Store. + assert(Opcode == Instruction::Store && + "Expected Store Instruction at this point"); + + // There are no strided stores at the moment. And a store can't be folded in + // shuffle. + unsigned NumOfSources = Factor; // The number of values to be merged. + unsigned ShuffleCost = + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); + unsigned NumOfShufflesPerStore = NumOfSources - 1; + + // The SK_PermuteTwoSrc shuffle clobbers one of src operands. + // We need additional instructions to keep sources. 
+ unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; + int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + + NumOfMoves; + return Cost; +} + +int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) { + RequiresBW = false; + Type *EltTy = VecTy->getVectorElementType(); + if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || + EltTy->isIntegerTy(32) || EltTy->isPointerTy()) + return true; + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) { + RequiresBW = true; + return true; + } + return false; + }; + bool RequiresBW; + bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW); + if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) + return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} Index: llvm/trunk/test/Analysis/CostModel/X86/interleave-load-i32.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/interleave-load-i32.ll +++ llvm/trunk/test/Analysis/CostModel/X86/interleave-load-i32.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i32_interleave4() { +;CHECK-LABEL: load_i32_interleave4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load +;CHECK: Found an 
estimated cost of 5 for VF 4 For instruction: %0 = load +;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load +;CHECK: Found an estimated cost of 22 for VF 16 For instruction: %0 = load +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 16 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %3 = or i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3 + %4 = load i32, i32* %arrayidx6, align 8 + %add7 = add nsw i32 %add3, %4 + %5 = or i64 %indvars.iv, 3 + %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5 + %6 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %add7, %6 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %add11, i32* %arrayidx13, align 16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +define void @load_i32_interleave5() { +;CHECK-LABEL: load_i32_interleave5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load +;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load +;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: %0 = load +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 
0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3 + %4 = load i32, i32* %arrayidx6, align 4 + %add7 = add nsw i32 %add3, %4 + %5 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5 + %6 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %add7, %6 + %7 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx14 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %7 + %8 = load i32, i32* %arrayidx14, align 4 + %add15 = add nsw i32 %add11, %8 + %arrayidx17 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %add15, i32* %arrayidx17, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: llvm/trunk/test/Analysis/CostModel/X86/interleave-store-i32.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/interleave-store-i32.ll +++ llvm/trunk/test/Analysis/CostModel/X86/interleave-store-i32.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @store_i32_interleave4() { +;CHECK-LABEL: 
store_i32_interleave4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 16 + %add = add nsw i32 %0, 1 + %1 = or i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = or i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 8 + %add16 = add nsw i32 %0, 3 + %3 = or i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +define void @store_i32_interleave5() { +;CHECK-LABEL: store_i32_interleave5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 21 for VF 8 For 
instruction: store i32 %add22 +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4 + %add = add nsw i32 %0, 1 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 4 + %add16 = add nsw i32 %0, 3 + %3 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %add22 = add nsw i32 %0, 4 + %4 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx25 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %4 + store i32 %add22, i32* %arrayidx25, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll +++ llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll @@ -12,19 +12,19 @@ ; CHECK-LABEL: 'test_vXf64' define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { - ; SSE: Unknown cost {{.*}} %V128 = shufflevector - ; AVX: Unknown 
cost {{.*}} %V128 = shufflevector - ; AVX512: Unknown cost {{.*}} %V128 = shufflevector + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer - ; SSE: Unknown cost {{.*}} %V256 = shufflevector - ; AVX: Unknown cost {{.*}} %V256 = shufflevector - ; AVX512: Unknown cost {{.*}} %V256 = shufflevector + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer - ; SSE: Unknown cost {{.*}} %V512 = shufflevector - ; AVX: Unknown cost {{.*}} %V512 = shufflevector - ; AVX512: Unknown cost {{.*}} %V512 = shufflevector + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer ret void Index: llvm/trunk/test/Analysis/CostModel/X86/shuffle-single-src.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/shuffle-single-src.ll +++ llvm/trunk/test/Analysis/CostModel/X86/shuffle-single-src.ll @@ -0,0 +1,94 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX + +; +; Verify the cost model for 1 src shuffles +; + +; SKX-LABEL: 'test_vXf64' +define void @test_vXf64(<4 x double> %src256, <8 x double> %src512, <16 x double> %src1024) { + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> + + ; SKX: cost of 2 {{.*}} %V1024 = 
shufflevector + %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> + + ret void +} + +; SKX-LABEL: 'test_vXi64' +define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) { + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512, <32 x i32> %src1024) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> + + ; SKX: cost of 2 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> + + 
; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> + + ; SKX: cost of 2 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> + + ; SKX: cost of 3 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> + + ; SKX: cost of 8 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> + + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/shuffle-two-src.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/shuffle-two-src.ll +++ llvm/trunk/test/Analysis/CostModel/X86/shuffle-two-src.ll @@ -0,0 +1,68 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX + +; +; Verify the cost model for 2 src shuffles +; + +; SKX-LABEL: 'test_vXf64' +define void @test_vXf64(<4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) { + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512, <32 x float> %src1024, <4 x float> %src128_1, <8 x float> %src256_1, 
<16 x float> %src512_1, <32 x float> %src1024_1) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024, <8 x i16> %src128_1, <16 x i16> %src256_1, <32 x i16> %src512_1, <64 x i16> %src1024_1) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512, <16 x i8> %src128_1, <32 x i8> %src256_1, <64 x i8> %src512_1) { + ; SKX: cost of 3 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> + + ; SKX: cost of 3 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> + + ; SKX: cost of 19 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> + + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i16.ll 
=================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i16.ll +++ llvm/trunk/test/Analysis/CostModel/X86/strided-load-i16.ll @@ -0,0 +1,113 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i16] zeroinitializer, align 16 +@B = global [10240 x i16] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i16_stride2() { +;CHECK-LABEL: load_i16_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride3() { +;CHECK-LABEL: load_i16_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For 
instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride4() { +;CHECK-LABEL: load_i16_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; 
preds = %for.body + ret void +} + +define void @load_i16_stride5() { +;CHECK-LABEL: load_i16_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i32.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i32.ll +++ llvm/trunk/test/Analysis/CostModel/X86/strided-load-i32.ll @@ -0,0 +1,110 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_int_stride2() { +;CHECK-LABEL: load_int_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated 
cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride3() { +;CHECK-LABEL: load_int_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void 
@load_int_stride4() { +;CHECK-LABEL: load_int_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride5() { +;CHECK-LABEL: load_int_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + 
%exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i64.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i64.ll +++ llvm/trunk/test/Analysis/CostModel/X86/strided-load-i64.ll @@ -0,0 +1,81 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i64] zeroinitializer, align 16 +@B = global [10240 x i64] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i64_stride2() { +;CHECK-LABEL: load_i64_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride3() { +;CHECK-LABEL: load_i64_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 
= load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride4() { +;CHECK-LABEL: load_i64_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 4 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll +++ 
llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll @@ -0,0 +1,117 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i8] zeroinitializer, align 16 +@B = global [10240 x i8] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i8_stride2() { +;CHECK-LABEL: load_i8_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride3() { +;CHECK-LABEL: load_i8_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For 
instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride4() { +;CHECK-LABEL: load_i8_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + 
br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride5() { +;CHECK-LABEL: load_i8_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +}