Index: ../include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- ../include/llvm/Analysis/TargetTransformInfo.h
+++ ../include/llvm/Analysis/TargetTransformInfo.h
@@ -463,7 +463,10 @@
     SK_Reverse,          ///< Reverse the order of the vector.
     SK_Alternate,        ///< Choose alternate elements from vector.
     SK_InsertSubvector,  ///< InsertSubvector. Index indicates start offset.
-    SK_ExtractSubvector  ///< ExtractSubvector Index indicates start offset.
+    SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset.
+    SK_MergeTwoSrc,      ///< Merge two source vectors into one with any
+                         ///< shuffle mask.
+    SK_PermuteOneSrc     ///< Shuffle elements of one source vector with any
+                         ///< shuffle mask.
   };
 
   /// \brief Additional information about an operand's possible values.
Index: ../include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- ../include/llvm/CodeGen/BasicTTIImpl.h
+++ ../include/llvm/CodeGen/BasicTTIImpl.h
@@ -60,8 +60,9 @@
     return Cost;
   }
 
-  /// Estimate the cost overhead of SK_Alternate shuffle.
-  unsigned getAltShuffleOverhead(Type *Ty) {
+  /// Estimate the cost overhead of a shuffle as a sequence of extract and
+  /// insert operations.
+  unsigned getAllPermutationsShuffleOverhead(Type *Ty) {
     assert(Ty->isVectorTy() && "Can only shuffle vectors");
     unsigned Cost = 0;
     // Shuffle cost is equal to the cost of extracting element from its argument
@@ -351,8 +352,9 @@
   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                           Type *SubTp) {
-    if (Kind == TTI::SK_Alternate) {
-      return getAltShuffleOverhead(Tp);
+    if (Kind == TTI::SK_Alternate || Kind == TTI::SK_MergeTwoSrc ||
+        Kind == TTI::SK_PermuteOneSrc) {
+      return getAllPermutationsShuffleOverhead(Tp);
     }
     return 1;
   }
Index: ../lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- ../lib/Target/X86/X86TargetTransformInfo.h
+++ ../lib/Target/X86/X86TargetTransformInfo.h
@@ -80,6 +80,13 @@
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
 
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor, ArrayRef<unsigned> Indices,
+                                 unsigned Alignment, unsigned AddressSpace);
+  int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+                                       unsigned Factor,
+                                       ArrayRef<unsigned> Indices,
+                                       unsigned Alignment,
+                                       unsigned AddressSpace);
+
   int getIntImmCost(int64_t);
 
   int getIntImmCost(const APInt &Imm, Type *Ty);
Index: ../lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- ../lib/Target/X86/X86TargetTransformInfo.cpp
+++ ../lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,9 +598,6 @@
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only estimate the cost of reverse and alternate shuffles.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
   if (Kind == TTI::SK_Reverse) {
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -700,9 +697,8 @@
     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl,
                                             ISD::VECTOR_SHUFFLE, LT.second))
       return LT.first * Entry->Cost;
-  }
-  if (Kind == TTI::SK_Alternate) {
+  } else if (Kind == TTI::SK_Alternate) {
     // 64-bit packed float vectors (v2f32) are widened to type v4f32.
     // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -792,7 +788,101 @@
     if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
                                             ISD::VECTOR_SHUFFLE, LT.second))
       return LT.first * Entry->Cost;
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  } else if (Kind == TTI::SK_MergeTwoSrc) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  1 }, // vpermt2b
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  1 }, // vpermt2b
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8,  1 }  // vpermt2b
+    };
+
+    if (ST->hasVBMI())
+      if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 },  // vpermt2w
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 },  // vpermt2w
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16,  1 },  // vpermt2w
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  3 },  // zext + vpermt2w + trunc
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  19 }, // 6 * v32i8 + 1
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8,  3 }   // zext + vpermt2w + trunc
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8f64,  1 }, // vpermt2pd
+      { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermt2ps
+      { ISD::VECTOR_SHUFFLE, MVT::v8i64,  1 }, // vpermt2q
+      { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermt2d
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  1 }, // vpermt2pd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  1 }, // vpermt2ps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  1 }, // vpermt2q
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  1 }, // vpermt2d
+      { ISD::VECTOR_SHUFFLE, MVT::v2f64,  1 }, // vpermt2pd
+      { ISD::VECTOR_SHUFFLE, MVT::v4f32,  1 }, // vpermt2ps
+      { ISD::VECTOR_SHUFFLE, MVT::v2i64,  1 }, // vpermt2q
+      { ISD::VECTOR_SHUFFLE, MVT::v4i32,  1 }  // vpermt2d
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+  } else if (Kind == TTI::SK_PermuteOneSrc) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8, 1 }, // vpermb
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8, 1 }  // vpermb
+    };
+
+    if (ST->hasVBMI())
+      if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpermw
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpermw
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16,  1 }, // vpermw
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  8 }, // extend to v32i16
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  3 }  // vpermw + zext/trunc
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8f64,  1 }, // vpermpd
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  1 }, // vpermpd
+      { ISD::VECTOR_SHUFFLE, MVT::v2f64,  1 }, // vpermpd
+      { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermps
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  1 }, // vpermps
+      { ISD::VECTOR_SHUFFLE, MVT::v4f32,  1 }, // vpermps
+      { ISD::VECTOR_SHUFFLE, MVT::v8i64,  1 }, // vpermq
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  1 }, // vpermq
+      { ISD::VECTOR_SHUFFLE, MVT::v2i64,  1 }, // vpermq
+      { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  1 }, // vpermd
+      { ISD::VECTOR_SHUFFLE, MVT::v4i32,  1 }, // vpermd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8,  1 }  // pshufb
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
   }
 
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -2013,3 +2103,104 @@
   // As a temporary solution, disable on Atom.
   return !(ST->isAtom() || ST->isSLM());
 }
+
+// Get an estimate of the cost of interleaved load/store operations and of
+// strided loads.
+// \p Indices contains the indices for a strided load.
+// \p Factor is the interleaving factor.
+// AVX-512 provides 3-src shuffles that significantly reduce the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+                                                 unsigned Factor,
+                                                 ArrayRef<unsigned> Indices,
+                                                 unsigned Alignment,
+                                                 unsigned AddressSpace) {
+
+  // VecTy for an interleaved memop is <VF*Factor x Elt>.
+  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+  // VecTy = <12 x i32>.
+
+  // Calculate the number of memory operations (NumOfMemOps) required to
+  // load/store the whole VecTy.
+  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+  unsigned LegalVTSize = LegalVT.getStoreSize();
+  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+  // Get the cost of one memory operation.
+  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+                                        LegalVT.getVectorNumElements());
+  unsigned MemOpCost =
+      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+  if (Opcode == Instruction::Load) {
+    // The kind of shuffle depends on the number of loaded values.
+    // If we load all the data in one register, we can use a 1-src shuffle.
+    // Otherwise, each shuffle operation merges 2 sources.
+    TTI::ShuffleKind ShuffleKind =
+        (NumOfMemOps > 1) ? TTI::SK_MergeTwoSrc : TTI::SK_PermuteOneSrc;
+    unsigned ShuffleCost =
+        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+    unsigned NumOfResults = Indices.size() ? Indices.size() : Factor;
+
+    // About half of the loads may be folded into the shuffles when we have
+    // only one result.
+    unsigned NumOfUnfoldedLoads =
+        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+    // Get the number of shuffle operations per result.
+    unsigned NumOfShufflesPerResult =
+        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+    // The SK_MergeTwoSrc shuffle clobbers one of its source operands.
+    // When we have more than one destination, we need additional instructions
+    // to keep the sources intact.
+    unsigned NumOfMoves = 0;
+    if (NumOfResults > 1 && ShuffleKind == TTI::SK_MergeTwoSrc)
+      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+
+    return Cost;
+  }
+
+  // Store.
+  // There are no strided stores for now, and a store cannot be folded into
+  // a shuffle.
+  unsigned NumOfSources = Factor; // The number of values to be merged.
+  unsigned ShuffleCost =
+      getShuffleCost(TTI::SK_MergeTwoSrc, SingleMemOpTy, 0, nullptr);
+  unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+  // The SK_MergeTwoSrc shuffle clobbers one of its source operands.
+  // We need additional instructions to keep the sources intact.
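+  // For example, an i32 store group with Factor = 4 at VF = 4 has
+  // VecTy = <16 x i32>: one v16i32 store, 3 vpermt2d shuffles and 1 copy,
+  // i.e. 1 + 3 + 1 = 5, the value checked in interleave-store-i32.ll.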
+  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+             NumOfMoves;
+  return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
+    RequiresBW = false;
+    Type *EltTy = VecTy->getVectorElementType();
+    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+      return true;
+    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
+      RequiresBW = true;
+      return true;
+    }
+    return false;
+  };
+  bool RequiresBW;
+  bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
+  if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
+    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
+                                            Alignment, AddressSpace);
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
Index: ../test/Analysis/CostModel/X86/interleave-load-i32.ll
===================================================================
--- ../test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ ../test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -0,0 +1,85 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i32] zeroinitializer, align 16
+@B = global [10240 x i32] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_i32_interleave4() {
+;CHECK-LABEL: load_i32_interleave4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load
+;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load
+;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load
+;CHECK: Found an estimated cost of 22 for VF 16 For instruction: %0 = load
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 16
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1
+  %2 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %2, %0
+  %3 = or i64 %indvars.iv, 2
+  %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3
+  %4 = load i32, i32* %arrayidx6, align 8
+  %add7 = add nsw i32 %add3, %4
+  %5 = or i64 %indvars.iv, 3
+  %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5
+  %6 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add7, %6
+  %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+  store i32 %add11, i32* %arrayidx13, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %cmp = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+define void @load_i32_interleave5() {
+;CHECK-LABEL: load_i32_interleave5
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an
estimated cost of 6 for VF 2 For instruction: %0 = load +;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load +;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: %0 = load +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3 + %4 = load i32, i32* %arrayidx6, align 4 + %add7 = add nsw i32 %add3, %4 + %5 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5 + %6 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %add7, %6 + %7 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx14 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %7 + %8 = load i32, i32* %arrayidx14, align 4 + %add15 = add nsw i32 %add11, %8 + %arrayidx17 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %add15, i32* %arrayidx17, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: ../test/Analysis/CostModel/X86/interleave-store-i32.ll =================================================================== --- ../test/Analysis/CostModel/X86/interleave-store-i32.ll +++ ../test/Analysis/CostModel/X86/interleave-store-i32.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @store_i32_interleave4() { +;CHECK-LABEL: store_i32_interleave4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 16 + %add = add nsw i32 %0, 1 + %1 = or i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = or i64 %indvars.iv, 2 + %arrayidx13 = 
getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 8 + %add16 = add nsw i32 %0, 3 + %3 = or i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +define void @store_i32_interleave5() { +;CHECK-LABEL: store_i32_interleave5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 21 for VF 8 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4 + %add = add nsw i32 %0, 1 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 4 + %add16 = add nsw i32 %0, 3 + %3 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %add22 = add nsw i32 %0, 4 + %4 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx25 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %4 + store i32 %add22, i32* %arrayidx25, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: ../test/Analysis/CostModel/X86/strided-load-i16.ll =================================================================== --- ../test/Analysis/CostModel/X86/strided-load-i16.ll +++ ../test/Analysis/CostModel/X86/strided-load-i16.ll @@ -0,0 +1,113 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i16] zeroinitializer, align 16 +@B = global [10240 x i16] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i16_stride2() { +;CHECK-LABEL: load_i16_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = 
%for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride3() { +;CHECK-LABEL: load_i16_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride4() { +;CHECK-LABEL: load_i16_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride5() { +;CHECK-LABEL: load_i16_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw 
i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: ../test/Analysis/CostModel/X86/strided-load-i32.ll =================================================================== --- ../test/Analysis/CostModel/X86/strided-load-i32.ll +++ ../test/Analysis/CostModel/X86/strided-load-i32.ll @@ -0,0 +1,110 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_int_stride2() { +;CHECK-LABEL: load_int_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride3() { +;CHECK-LABEL: load_int_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride4() { +;CHECK-LABEL: load_int_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found 
an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride5() { +;CHECK-LABEL: load_int_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + Index: ../test/Analysis/CostModel/X86/strided-load-i64.ll =================================================================== --- ../test/Analysis/CostModel/X86/strided-load-i64.ll +++ ../test/Analysis/CostModel/X86/strided-load-i64.ll @@ -0,0 +1,81 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i64] zeroinitializer, align 16 +@B = global [10240 x i64] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i64_stride2() { +;CHECK-LABEL: load_i64_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride3() { +;CHECK-LABEL: load_i64_stride3 +;CHECK: Found an 
estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride4() { +;CHECK-LABEL: load_i64_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 4 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: ../test/Analysis/CostModel/X86/strided-load-i8.ll =================================================================== --- ../test/Analysis/CostModel/X86/strided-load-i8.ll +++ ../test/Analysis/CostModel/X86/strided-load-i8.ll @@ -0,0 +1,117 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i8] zeroinitializer, align 16 +@B = global [10240 x i8] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i8_stride2() { +;CHECK-LABEL: load_i8_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride3() { +;CHECK-LABEL: load_i8_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride4() { +;CHECK-LABEL: load_i8_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride5() { +;CHECK-LABEL: load_i8_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr 
inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +}
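For reference, here is a minimal standalone sketch of the interleaved load/store cost arithmetic implemented in getInterleavedMemoryOpCostAVX512 above; the file and helper names are illustrative and not part of the patch. It assumes a unit cost for every legal vector load/store and for every legal one-source or two-source AVX-512 shuffle (the values the tables above assign on Skylake-AVX512). For stride-4 i32 groups it reproduces the figures checked in interleave-load-i32.ll and interleave-store-i32.ll: loads 5, 5, 8, 22 and stores 5, 5, 11, 22 for VF 2, 4, 8, 16.

// interleave_cost_sketch.cpp - illustrative arithmetic only.
#include <algorithm>
#include <cstdio>

// Bytes in one 512-bit vector register. Groups smaller than this still take
// a single memory operation, so for the i32 cases below the operation counts
// match what the cost model computes from the legalized type.
static const unsigned LegalVTBytes = 64;

// Estimated cost of an interleaved load group (all Factor members used).
static unsigned loadGroupCost(unsigned VF, unsigned Factor, unsigned EltBytes) {
  unsigned NumOfMemOps =
      (VF * Factor * EltBytes + LegalVTBytes - 1) / LegalVTBytes;
  unsigned MemOpCost = 1;   // assumed: one legal vector load
  unsigned ShuffleCost = 1; // assumed: vpermd / vpermt2d on a legal type
  bool MergeTwoSrc = NumOfMemOps > 1; // otherwise a one-source permute
  unsigned NumOfResults = Factor;     // every index of the group is used
  unsigned NumOfUnfoldedLoads =
      NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
  unsigned NumOfShufflesPerResult = std::max(1u, NumOfMemOps - 1);
  unsigned NumOfMoves = (NumOfResults > 1 && MergeTwoSrc)
                            ? NumOfResults * NumOfShufflesPerResult / 2
                            : 0;
  return NumOfResults * NumOfShufflesPerResult * ShuffleCost +
         NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
}

// Estimated cost of an interleaved store group.
static unsigned storeGroupCost(unsigned VF, unsigned Factor,
                               unsigned EltBytes) {
  unsigned NumOfMemOps =
      (VF * Factor * EltBytes + LegalVTBytes - 1) / LegalVTBytes;
  unsigned MemOpCost = 1;
  unsigned ShuffleCost = 1;                    // assumed: vpermt2d
  unsigned NumOfShufflesPerStore = Factor - 1; // merge Factor sources
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  return NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
         NumOfMoves;
}

int main() {
  // Stride-4 i32 groups: expect loads 5 5 8 22 and stores 5 5 11 22.
  for (unsigned VF : {2u, 4u, 8u, 16u})
    std::printf("VF %2u: load %2u  store %2u\n", VF,
                loadGroupCost(VF, /*Factor=*/4, /*EltBytes=*/4),
                storeGroupCost(VF, /*Factor=*/4, /*EltBytes=*/4));
  return 0;
}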