diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2393,10 +2393,11 @@ }; assert(Val->isVectorTy() && "This must be a vector type"); - Type *ScalarType = Val->getScalarType(); + int RegisterFileMoveCost = 0; - if (Index != -1U) { + if (Index != -1U && (Opcode == Instruction::ExtractElement || + Opcode == Instruction::InsertElement)) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Val); @@ -2405,17 +2406,31 @@ return 0; // The type may be split. Normalize the index to the new type. + unsigned NumSubVecs = 1; unsigned Width = LT.second.getVectorNumElements(); + unsigned SubWidth = Width; Index = Index % Width; + // For >128-bit vectors, we need to extract higher 128-bit subvectors. + // For INSERT_VECTOR_ELT, we also need to insert the subvector back. + if (LT.second.getSizeInBits() > 128) { + assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector"); + NumSubVecs = LT.second.getSizeInBits() / 128; + SubWidth = Width / NumSubVecs; + if (SubWidth <= Index) { + RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); + Index %= SubWidth; + } + } + if (Index == 0) { // Floating point scalars are already located in index #0. if (ScalarType->isFloatingPointTy()) - return 0; + return RegisterFileMoveCost; // Assume movd/movq XMM <-> GPR is relatively cheap on all targets. if (ScalarType->isIntegerTy()) - return 1; + return 1 + RegisterFileMoveCost; } int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -2423,14 +2438,30 @@ MVT MScalarTy = LT.second.getScalarType(); if (ST->isSLM()) if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) - return LT.first * Entry->Cost; + return Entry->Cost + RegisterFileMoveCost; + + // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. + if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || + (MScalarTy.isInteger() && ST->hasSSE41())) + return 1 + RegisterFileMoveCost; + + // For extractions we need to shuffle the element to index 0. + // For insertions we need to shuffle the elements to its destination. + // In both cases we must handle the subvectors move(s). + // TODO: Under what circumstances should we shuffle using the full width? + Type *SubVecTy = VectorType::get(Val->getVectorElementType(), SubWidth); + int ShuffleCost = getShuffleCost(Opcode == Instruction::InsertElement + ? TTI::SK_PermuteTwoSrc + : TTI::SK_PermuteSingleSrc, + SubVecTy, 0, SubVecTy); + int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; + return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; } // Add to the base cost if we know that the extracted element of a vector is // destined to be moved to and used in the integer register file. - int RegisterFileMoveCost = 0; if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) - RegisterFileMoveCost = 1; + RegisterFileMoveCost += 1; return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; } diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll --- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll @@ -684,9 +684,9 @@ define i32 @frem(i32 %arg) { ; SSE1-LABEL: 'frem' ; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef ; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef ; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef @@ -695,9 +695,9 @@ ; ; SSE2-LABEL: 'frem' ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef @@ -706,9 +706,9 @@ ; ; SSE42-LABEL: 'frem' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef @@ -717,31 +717,31 @@ ; ; AVX-LABEL: 'frem' ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = frem <16 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F32 = frem <8 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = frem <8 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = frem <8 x double> undef, undef ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'frem' ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef ; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16F32 = frem <16 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = frem <8 x double> undef, undef ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'frem' ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef @@ -750,9 +750,9 @@ ; ; GLM-LABEL: 'frem' ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef @@ -761,13 +761,13 @@ ; ; BTVER2-LABEL: 'frem' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; BTVER2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef -; BTVER2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = frem <16 x float> undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F32 = frem <8 x float> undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef -; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = frem <8 x double> undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = frem <8 x double> undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F32 = frem float undef, undef @@ -1103,9 +1103,9 @@ define i32 @fma(i32 %arg) { ; SSE1-LABEL: 'fma' ; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SSE1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SSE1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SSE1-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; SSE1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; SSE1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; SSE1-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) ; SSE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) ; SSE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) @@ -1114,9 +1114,9 @@ ; ; SSE2-LABEL: 'fma' ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) @@ -1125,9 +1125,9 @@ ; ; SSE42-LABEL: 'fma' ; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) @@ -1136,13 +1136,13 @@ ; ; AVX-LABEL: 'fma' ; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'fma' @@ -1158,9 +1158,9 @@ ; ; SLM-LABEL: 'fma' ; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) @@ -1169,9 +1169,9 @@ ; ; GLM-LABEL: 'fma' ; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) @@ -1180,13 +1180,13 @@ ; ; BTVER2-LABEL: 'fma' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll --- a/llvm/test/Analysis/CostModel/X86/extend.ll +++ b/llvm/test/Analysis/CostModel/X86/extend.ll @@ -374,8 +374,8 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> -; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> -; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> @@ -399,8 +399,8 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> -; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> -; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> @@ -414,7 +414,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = zext i1 undef to i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = zext i1 undef to i32 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> @@ -424,8 +424,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> @@ -439,7 +439,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = zext i1 undef to i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = zext i1 undef to i32 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> @@ -474,8 +474,8 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> @@ -877,8 +877,8 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> -; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> -; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> @@ -902,8 +902,8 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> -; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> -; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> @@ -917,7 +917,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> @@ -927,8 +927,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> @@ -942,7 +942,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> @@ -977,8 +977,8 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> diff --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll --- a/llvm/test/Analysis/CostModel/X86/fptosi.ll +++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll @@ -11,25 +11,32 @@ ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 define i32 @fptosi_double_i64(i32 %arg) { -; SSE-LABEL: 'fptosi_double_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptosi_double_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptosi_double_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptosi_double_i64' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 ; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'fptosi_double_i64' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'fptosi_double_i64' @@ -49,8 +56,8 @@ ; BTVER2-LABEL: 'fptosi_double_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = fptosi double undef to i64 @@ -147,12 +154,19 @@ } define i32 @fptosi_double_i8(i32 %arg) { -; SSE-LABEL: 'fptosi_double_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptosi_double_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptosi_double_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptosi_double_i8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 @@ -190,28 +204,36 @@ } define i32 @fptosi_float_i64(i32 %arg) { -; SSE-LABEL: 'fptosi_float_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptosi_float_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptosi_float_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptosi_float_i64' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 ; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'fptosi_float_i64' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'fptosi_float_i64' @@ -233,9 +255,9 @@ ; BTVER2-LABEL: 'fptosi_float_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = fptosi float undef to i64 @@ -319,12 +341,19 @@ } define i32 @fptosi_float_i8(i32 %arg) { -; SSE-LABEL: 'fptosi_float_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptosi_float_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 327 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptosi_float_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptosi_float_i8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 diff --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll --- a/llvm/test/Analysis/CostModel/X86/fptoui.ll +++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll @@ -11,25 +11,32 @@ ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 define i32 @fptoui_double_i64(i32 %arg) { -; SSE-LABEL: 'fptoui_double_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptoui_double_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptoui_double_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptoui_double_i64' ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 ; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'fptoui_double_i64' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'fptoui_double_i64' @@ -49,8 +56,8 @@ ; BTVER2-LABEL: 'fptoui_double_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = fptoui double undef to i64 @@ -61,12 +68,19 @@ } define i32 @fptoui_double_i32(i32 %arg) { -; SSE-LABEL: 'fptoui_double_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptoui_double_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptoui_double_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptoui_double_i32' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 @@ -147,12 +161,19 @@ } define i32 @fptoui_double_i8(i32 %arg) { -; SSE-LABEL: 'fptoui_double_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptoui_double_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptoui_double_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptoui_double_i8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 @@ -190,28 +211,36 @@ } define i32 @fptoui_float_i64(i32 %arg) { -; SSE-LABEL: 'fptoui_float_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptoui_float_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptoui_float_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptoui_float_i64' ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 ; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'fptoui_float_i64' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'fptoui_float_i64' @@ -233,9 +262,9 @@ ; BTVER2-LABEL: 'fptoui_float_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = fptoui float undef to i64 @@ -247,12 +276,19 @@ } define i32 @fptoui_float_i32(i32 %arg) { -; SSE-LABEL: 'fptoui_float_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptoui_float_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptoui_float_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptoui_float_i32' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 @@ -333,12 +369,19 @@ } define i32 @fptoui_float_i8(i32 %arg) { -; SSE-LABEL: 'fptoui_float_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'fptoui_float_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 327 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'fptoui_float_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptoui_float_i8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 diff --git a/llvm/test/Analysis/CostModel/X86/fround.ll b/llvm/test/Analysis/CostModel/X86/fround.ll --- a/llvm/test/Analysis/CostModel/X86/fround.ll +++ b/llvm/test/Analysis/CostModel/X86/fround.ll @@ -16,9 +16,9 @@ define i32 @ceil(i32 %arg) { ; SSE2-LABEL: 'ceil' ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.ceil.f32(float undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.ceil.f64(double undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.ceil.v2f64(<2 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.ceil.v4f64(<4 x double> undef) @@ -107,9 +107,9 @@ define i32 @floor(i32 %arg) { ; SSE2-LABEL: 'floor' ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.floor.f32(float undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.floor.f64(double undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.floor.v2f64(<2 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.floor.v4f64(<4 x double> undef) @@ -198,9 +198,9 @@ define i32 @nearbyint(i32 %arg) { ; SSE2-LABEL: 'nearbyint' ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.nearbyint.f32(float undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.nearbyint.f64(double undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> undef) @@ -289,9 +289,9 @@ define i32 @rint(i32 %arg) { ; SSE2-LABEL: 'rint' ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef) @@ -380,9 +380,9 @@ define i32 @trunc(i32 %arg) { ; SSE2-LABEL: 'trunc' ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.trunc.f32(float undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.trunc.f64(double undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.trunc.v2f64(<2 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.trunc.v4f64(<4 x double> undef) diff --git a/llvm/test/Analysis/CostModel/X86/insert-extract-at-zero.ll b/llvm/test/Analysis/CostModel/X86/insert-extract-at-zero.ll --- a/llvm/test/Analysis/CostModel/X86/insert-extract-at-zero.ll +++ b/llvm/test/Analysis/CostModel/X86/insert-extract-at-zero.ll @@ -1,39 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" define i32 @insert-extract-at-zero-idx(i32 %arg, float %fl) { - ;CHECK: cost of 0 {{.*}} extract +; CHECK-LABEL: 'insert-extract-at-zero-idx' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %A = extractelement <4 x float> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = extractelement <4 x i32> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = extractelement <4 x float> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %D = extractelement <8 x float> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = extractelement <8 x float> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F = extractelement <8 x float> undef, i32 %arg +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %G = insertelement <4 x float> undef, float %fl, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %H = insertelement <4 x float> undef, float %fl, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I = insertelement <4 x i32> undef, i32 %arg, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %J = insertelement <4 x double> undef, double undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %K = insertelement <8 x double> undef, double undef, i32 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %L = insertelement <16 x double> undef, double undef, i32 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %M = insertelement <16 x double> undef, double undef, i32 9 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; %A = extractelement <4 x float> undef, i32 0 - ;CHECK: cost of 1 {{.*}} extract %B = extractelement <4 x i32> undef, i32 0 - ;CHECK: cost of 1 {{.*}} extract %C = extractelement <4 x float> undef, i32 1 - ;CHECK: cost of 0 {{.*}} extract %D = extractelement <8 x float> undef, i32 0 - ;CHECK: cost of 1 {{.*}} extract %E = extractelement <8 x float> undef, i32 1 - ;CHECK: cost of 1 {{.*}} extract %F = extractelement <8 x float> undef, i32 %arg - ;CHECK: cost of 0 {{.*}} insert %G = insertelement <4 x float> undef, float %fl, i32 0 - ;CHECK: cost of 1 {{.*}} insert %H = insertelement <4 x float> undef, float %fl, i32 1 - ;CHECK: cost of 1 {{.*}} insert %I = insertelement <4 x i32> undef, i32 %arg, i32 0 - ;CHECK: cost of 0 {{.*}} insert %J = insertelement <4 x double> undef, double undef, i32 0 - ;CHECK: cost of 0 {{.*}} insert %K = insertelement <8 x double> undef, double undef, i32 4 - ;CHECK: cost of 0 {{.*}} insert %L = insertelement <16 x double> undef, double undef, i32 8 - ;CHECK: cost of 1 {{.*}} insert %M = insertelement <16 x double> undef, double undef, i32 9 ret i32 0 } diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll @@ -1,9 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck %s -check-prefix=CORE2 ; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=corei7 -cost-model -analyze < %s | FileCheck %s -check-prefix=COREI7 ; If SSE4.1 roundps instruction is available it is cheap to lower, otherwise ; it'll be scalarized into calls which are expensive. define void @test1(float* nocapture %f) nounwind { +; CORE2-LABEL: 'test1' +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br label %vector.body +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = getelementptr inbounds float, float* %f, i64 %index +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = bitcast float* %0 to <4 x float>* +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %wide.load = load <4 x float>, <4 x float>* %1, align 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> %2, <4 x float>* %1, align 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %index.next = add i64 %index, 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp eq i64 %index.next, 1024 +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br i1 %3, label %for.end, label %vector.body +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; COREI7-LABEL: 'test1' +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br label %vector.body +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = getelementptr inbounds float, float* %f, i64 %index +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = bitcast float* %0 to <4 x float>* +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %wide.load = load <4 x float>, <4 x float>* %1, align 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> %2, <4 x float>* %1, align 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %index.next = add i64 %index, 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp eq i64 %index.next, 1024 +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br i1 %3, label %for.end, label %vector.body +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; vector.ph: br label %vector.body @@ -21,17 +48,39 @@ for.end: ; preds = %vector.body ret void -; CORE2: Printing analysis 'Cost Model Analysis' for function 'test1': -; CORE2: Cost Model: Found an estimated cost of 46 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) -; COREI7: Printing analysis 'Cost Model Analysis' for function 'test1': -; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) } declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone define void @test2(float* nocapture %f) nounwind { +; CORE2-LABEL: 'test2' +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br label %vector.body +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = getelementptr inbounds float, float* %f, i64 %index +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = bitcast float* %0 to <4 x float>* +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %wide.load = load <4 x float>, <4 x float>* %1, align 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load) +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> %2, <4 x float>* %1, align 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %index.next = add i64 %index, 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp eq i64 %index.next, 1024 +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br i1 %3, label %for.end, label %vector.body +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; COREI7-LABEL: 'test2' +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br label %vector.body +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = getelementptr inbounds float, float* %f, i64 %index +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = bitcast float* %0 to <4 x float>* +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %wide.load = load <4 x float>, <4 x float>* %1, align 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load) +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> %2, <4 x float>* %1, align 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %index.next = add i64 %index, 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp eq i64 %index.next, 1024 +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br i1 %3, label %for.end, label %vector.body +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; vector.ph: br label %vector.body @@ -49,17 +98,39 @@ for.end: ; preds = %vector.body ret void -; CORE2: Printing analysis 'Cost Model Analysis' for function 'test2': -; CORE2: Cost Model: Found an estimated cost of 46 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load) -; COREI7: Printing analysis 'Cost Model Analysis' for function 'test2': -; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load) } declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone define void @test3(float* nocapture %f, <4 x float> %b, <4 x float> %c) nounwind { +; CORE2-LABEL: 'test3' +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br label %vector.body +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = getelementptr inbounds float, float* %f, i64 %index +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = bitcast float* %0 to <4 x float>* +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %wide.load = load <4 x float>, <4 x float>* %1, align 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> %2, <4 x float>* %1, align 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %index.next = add i64 %index, 4 +; CORE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp eq i64 %index.next, 1024 +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br i1 %3, label %for.end, label %vector.body +; CORE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; COREI7-LABEL: 'test3' +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br label %vector.body +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = getelementptr inbounds float, float* %f, i64 %index +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = bitcast float* %0 to <4 x float>* +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %wide.load = load <4 x float>, <4 x float>* %1, align 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> %2, <4 x float>* %1, align 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %index.next = add i64 %index, 4 +; COREI7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp eq i64 %index.next, 1024 +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: br i1 %3, label %for.end, label %vector.body +; COREI7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; vector.ph: br label %vector.body @@ -77,11 +148,7 @@ for.end: ; preds = %vector.body ret void -; CORE2: Printing analysis 'Cost Model Analysis' for function 'test3': -; CORE2: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) -; COREI7: Printing analysis 'Cost Model Analysis' for function 'test3': -; COREI7: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) } diff --git a/llvm/test/Analysis/CostModel/X86/load_store.ll b/llvm/test/Analysis/CostModel/X86/load_store.ll --- a/llvm/test/Analysis/CostModel/X86/load_store.ll +++ b/llvm/test/Analysis/CostModel/X86/load_store.ll @@ -1,81 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" define i32 @stores(i32 %arg) { - - ;CHECK: cost of 1 {{.*}} store +; CHECK-LABEL: 'stores' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, i8* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, i16* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, i32* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, i64* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 undef, i128* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> undef, <4 x i16>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, <4 x i32>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, <4 x i64>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, <8 x i16>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, <8 x i32>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, <8 x i64>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; store i8 undef, i8* undef, align 4 - ;CHECK: cost of 1 {{.*}} store store i16 undef, i16* undef, align 4 - ;CHECK: cost of 1 {{.*}} store store i32 undef, i32* undef, align 4 - ;CHECK: cost of 1 {{.*}} store store i64 undef, i64* undef, align 4 - ;CHECK: cost of 2 {{.*}} store store i128 undef, i128* undef, align 4 - ;CHECK: cost of 1 {{.*}} store store <4 x i16> undef, <4 x i16>* undef, align 4 - ;CHECK: cost of 1 {{.*}} store store <4 x i32> undef, <4 x i32>* undef, align 4 - ;CHECK: cost of 2 {{.*}} store store <4 x i64> undef, <4 x i64>* undef, align 4 - ;CHECK: cost of 1 {{.*}} store store <8 x i16> undef, <8 x i16>* undef, align 4 - ;CHECK: cost of 2 {{.*}} store store <8 x i32> undef, <8 x i32>* undef, align 4 - ;CHECK: cost of 4 {{.*}} store store <8 x i64> undef, <8 x i64>* undef, align 4 ret i32 undef } define i32 @loads(i32 %arg) { - ;CHECK: cost of 1 {{.*}} load +; CHECK-LABEL: 'loads' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, i8* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, i16* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, i32* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, i64* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load i128, i128* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i32>, <2 x i32>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <4 x i32>, <4 x i32>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, <8 x i32>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i64>, <2 x i64>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <4 x i64>, <4 x i64>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <8 x i64>, <8 x i64>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = load <3 x float>, <3 x float>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = load <3 x double>, <3 x double>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = load <3 x i32>, <3 x i32>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = load <3 x i64>, <3 x i64>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %16 = load <5 x i32>, <5 x i32>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %17 = load <5 x i64>, <5 x i64>* undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; load i8, i8* undef, align 4 - ;CHECK: cost of 1 {{.*}} load load i16, i16* undef, align 4 - ;CHECK: cost of 1 {{.*}} load load i32, i32* undef, align 4 - ;CHECK: cost of 1 {{.*}} load load i64, i64* undef, align 4 - ;CHECK: cost of 2 {{.*}} load load i128, i128* undef, align 4 - ;CHECK: cost of 1 {{.*}} load load <2 x i32>, <2 x i32>* undef, align 4 - ;CHECK: cost of 1 {{.*}} load load <4 x i32>, <4 x i32>* undef, align 4 - ;CHECK: cost of 2 {{.*}} load load <8 x i32>, <8 x i32>* undef, align 4 - ;CHECK: cost of 1 {{.*}} load load <2 x i64>, <2 x i64>* undef, align 4 - ;CHECK: cost of 2 {{.*}} load load <4 x i64>, <4 x i64>* undef, align 4 - ;CHECK: cost of 4 {{.*}} load load <8 x i64>, <8 x i64>* undef, align 4 - ;CHECK: cost of 3 {{.*}} load load <3 x float>, <3 x float>* undef, align 4 - ;CHECK: cost of 3 {{.*}} load load <3 x double>, <3 x double>* undef, align 4 - ;CHECK: cost of 3 {{.*}} load load <3 x i32>, <3 x i32>* undef, align 4 - ;CHECK: cost of 3 {{.*}} load load <3 x i64>, <3 x i64>* undef, align 4 - ;CHECK: cost of 10 {{.*}} load load <5 x i32>, <5 x i32>* undef, align 4 - ;CHECK: cost of 10 {{.*}} load load <5 x i64>, <5 x i64>* undef, align 4 ret i32 undef diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -9,32 +9,59 @@ ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze | FileCheck %s --check-prefixes=CHECK,AVX512,SKX define i32 @masked_load() { -; SSE-LABEL: 'masked_load' -; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; SSE2-LABEL: 'masked_load' +; SSE2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 222 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 238 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 428 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 214 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1636 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 818 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 409 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_load' +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX-LABEL: 'masked_load' ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) @@ -53,12 +80,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -80,12 +107,12 @@ ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -151,32 +178,59 @@ } define i32 @masked_store() { -; SSE-LABEL: 'masked_store' -; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; SSE2-LABEL: 'masked_store' +; SSE2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 210 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 226 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 428 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1456 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 728 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 364 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_store' +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX-LABEL: 'masked_store' ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) @@ -195,12 +249,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -222,12 +276,12 @@ ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -293,83 +347,110 @@ } define i32 @masked_gather() { -; SSE-LABEL: 'masked_gather' -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; SSE2-LABEL: 'masked_gather' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 908 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 454 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 227 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_gather' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX1-LABEL: 'masked_gather' -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX2-LABEL: 'masked_gather' -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -391,19 +472,19 @@ ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; KNL-LABEL: 'masked_gather' ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) @@ -411,19 +492,19 @@ ; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -445,12 +526,12 @@ ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -489,63 +570,90 @@ } define i32 @masked_scatter() { -; SSE-LABEL: 'masked_scatter' -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; SSE2-LABEL: 'masked_scatter' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 728 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 364 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 182 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_scatter' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX-LABEL: 'masked_scatter' -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; KNL-LABEL: 'masked_scatter' ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) @@ -553,19 +661,19 @@ ; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -587,12 +695,12 @@ ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 88 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 @@ -631,86 +739,140 @@ } define i32 @masked_expandload() { -; SSE-LABEL: 'masked_expandload' -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; SSE2-LABEL: 'masked_expandload' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 908 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 454 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 227 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_expandload' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX-LABEL: 'masked_expandload' -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; -; AVX512-LABEL: 'masked_expandload' -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; KNL-LABEL: 'masked_expandload' +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_expandload' +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef) %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef) @@ -746,86 +908,140 @@ } define i32 @masked_compressstore() { -; SSE-LABEL: 'masked_compressstore' -; SSE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; SSE2-LABEL: 'masked_compressstore' +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 193 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 209 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 395 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 197 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1391 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 695 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_compressstore' +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX-LABEL: 'masked_compressstore' -; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 127 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 255 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 127 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; -; AVX512-LABEL: 'masked_compressstore' -; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; KNL-LABEL: 'masked_compressstore' +; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 223 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_compressstore' +; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 119 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 239 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef) call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef) @@ -863,7 +1079,7 @@ define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) { ; SSE2-LABEL: 'test1' ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; SSE42-LABEL: 'test1' @@ -887,10 +1103,15 @@ } define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) { -; SSE-LABEL: 'test2' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; SSE2-LABEL: 'test2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; AVX-LABEL: 'test2' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -908,10 +1129,15 @@ } define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { -; SSE-LABEL: 'test3' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-LABEL: 'test3' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'test3' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test3' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -929,10 +1155,15 @@ } define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { -; SSE-LABEL: 'test4' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res +; SSE2-LABEL: 'test4' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res +; +; SSE42-LABEL: 'test4' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res ; ; AVX1-LABEL: 'test4' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer @@ -960,10 +1191,15 @@ } define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { -; SSE-LABEL: 'test5' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-LABEL: 'test5' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'test5' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test5' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -981,10 +1217,15 @@ } define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { -; SSE-LABEL: 'test6' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-LABEL: 'test6' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'test6' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test6' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1002,10 +1243,15 @@ } define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { -; SSE-LABEL: 'test7' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res +; SSE2-LABEL: 'test7' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res +; +; SSE42-LABEL: 'test7' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res ; ; AVX-LABEL: 'test7' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1023,10 +1269,15 @@ } define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { -; SSE-LABEL: 'test8' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res +; SSE2-LABEL: 'test8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res +; +; SSE42-LABEL: 'test8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res ; ; AVX-LABEL: 'test8' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1044,9 +1295,13 @@ } define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) { -; SSE-LABEL: 'test_gather_2f64' -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; SSE2-LABEL: 'test_gather_2f64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test_gather_2f64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; AVX1-LABEL: 'test_gather_2f64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) @@ -1069,9 +1324,13 @@ } define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { -; SSE-LABEL: 'test_gather_4i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; SSE2-LABEL: 'test_gather_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) @@ -1098,9 +1357,13 @@ } define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) { -; SSE-LABEL: 'test_gather_4i32_const_mask' -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; SSE2-LABEL: 'test_gather_4i32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32_const_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) @@ -1130,25 +1393,25 @@ ; SSE2-LABEL: 'test_gather_16f32_const_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_const_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_const_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_const_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_const_mask' @@ -1174,25 +1437,25 @@ ; SSE2-LABEL: 'test_gather_16f32_var_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 222 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_var_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_var_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_var_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_var_mask' @@ -1218,25 +1481,25 @@ ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 222 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_ra_var_mask' @@ -1260,11 +1523,11 @@ define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) { ; SSE2-LABEL: 'test_gather_16f32_const_mask2' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_const_mask2' @@ -1272,7 +1535,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_const_mask2' @@ -1280,7 +1543,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_const_mask2' @@ -1288,7 +1551,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_const_mask2' @@ -1318,20 +1581,28 @@ } define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { -; SSE-LABEL: 'test_scatter_16i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-LABEL: 'test_scatter_16i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE2-NEXT: Cost Model: Found an estimated cost of 226 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_16i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_scatter_16i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_scatter_16i32' @@ -1339,7 +1610,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKL-LABEL: 'test_scatter_16i32' @@ -1347,7 +1618,7 @@ ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SKL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_scatter_16i32' @@ -1368,12 +1639,16 @@ } define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) { -; SSE-LABEL: 'test_scatter_8i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-LABEL: 'test_scatter_8i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_8i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test_scatter_8i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_scatter_8i32' @@ -1385,9 +1660,13 @@ } define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { -; SSE-LABEL: 'test_scatter_4i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-LABEL: 'test_scatter_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test_scatter_4i32' ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) @@ -1409,25 +1688,25 @@ ; SSE2-LABEL: 'test_gather_4f32' ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SKL-LABEL: 'test_gather_4f32' @@ -1459,25 +1738,25 @@ ; SSE2-LABEL: 'test_gather_4f32_const_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32_const_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32_const_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32_const_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SKL-LABEL: 'test_gather_4f32_const_mask' diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll @@ -38,7 +38,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; AVX-NEXT: Cost Model: Unknown cost for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -51,7 +51,7 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Unknown cost for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -64,7 +64,7 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -103,7 +103,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -115,7 +115,7 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -127,7 +127,7 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -408,7 +408,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> @@ -432,7 +432,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> @@ -467,7 +467,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> @@ -491,7 +491,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> @@ -526,7 +526,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> @@ -550,7 +550,7 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> @@ -583,9 +583,9 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> @@ -607,9 +607,9 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> @@ -703,7 +703,7 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> @@ -727,7 +727,7 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> @@ -818,9 +818,9 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> @@ -842,9 +842,9 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> @@ -937,9 +937,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> @@ -961,9 +961,9 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> @@ -1437,9 +1437,9 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll @@ -51,11 +51,11 @@ ; AVX512-NEXT: Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -123,11 +123,11 @@ ; AVX512-NEXT: Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; diff --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll --- a/llvm/test/Analysis/CostModel/X86/sitofp.ll +++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll @@ -137,7 +137,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double ; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'sitofp_i64_double' @@ -280,18 +280,18 @@ ; ; AVX-LABEL: 'sitofp_i64_float' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'sitofp_i64_float' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'sitofp_i64_float' @@ -304,10 +304,10 @@ ; ; BTVER2-LABEL: 'sitofp_i64_float' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float -; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %cvt_i64_f32 = sitofp i64 undef to float diff --git a/llvm/test/Analysis/CostModel/X86/uitofp.ll b/llvm/test/Analysis/CostModel/X86/uitofp.ll --- a/llvm/test/Analysis/CostModel/X86/uitofp.ll +++ b/llvm/test/Analysis/CostModel/X86/uitofp.ll @@ -287,10 +287,10 @@ ; ; AVX-LABEL: 'uitofp_i64_float' ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f32 = uitofp i64 undef to float -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'uitofp_i64_float' @@ -311,10 +311,10 @@ ; ; BTVER2-LABEL: 'uitofp_i64_float' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f32 = uitofp i64 undef to float -; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %cvt_i64_f32 = uitofp i64 undef to float diff --git a/llvm/test/Analysis/CostModel/X86/vector-extract.ll b/llvm/test/Analysis/CostModel/X86/vector-extract.ll --- a/llvm/test/Analysis/CostModel/X86/vector-extract.ll +++ b/llvm/test/Analysis/CostModel/X86/vector-extract.ll @@ -34,12 +34,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = extractelement <4 x double> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = extractelement <4 x double> undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = extractelement <8 x double> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = extractelement <8 x double> undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = extractelement <8 x double> undef, i32 4 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'extract_double' @@ -48,12 +48,12 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = extractelement <4 x double> undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = extractelement <4 x double> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = extractelement <8 x double> undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = extractelement <8 x double> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_4 = extractelement <8 x double> undef, i32 4 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'extract_double' @@ -62,12 +62,12 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = extractelement <4 x double> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = extractelement <4 x double> undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = extractelement <8 x double> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = extractelement <8 x double> undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = extractelement <8 x double> undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2f64_a = extractelement <2 x double> undef, i32 %arg @@ -118,12 +118,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = extractelement <8 x float> undef, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = extractelement <8 x float> undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = extractelement <8 x float> undef, i32 4 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = extractelement <16 x float> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = extractelement <16 x float> undef, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = extractelement <16 x float> undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = extractelement <16 x float> undef, i32 8 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'extract_float' @@ -137,12 +137,12 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = extractelement <8 x float> undef, i32 0 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = extractelement <8 x float> undef, i32 3 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = extractelement <8 x float> undef, i32 4 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = extractelement <16 x float> undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = extractelement <16 x float> undef, i32 0 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = extractelement <16 x float> undef, i32 3 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = extractelement <16 x float> undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'extract_float' @@ -156,12 +156,12 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = extractelement <8 x float> undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = extractelement <8 x float> undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = extractelement <8 x float> undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = extractelement <16 x float> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = extractelement <16 x float> undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = extractelement <16 x float> undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = extractelement <16 x float> undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2f32_a = extractelement <2 x float> undef, i32 %arg @@ -191,43 +191,43 @@ ; SSE2-LABEL: 'extract_i64' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'extract_i64' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'extract_i64' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'extract_i64' @@ -250,12 +250,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'extract_i64' @@ -264,12 +264,12 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'extract_i64' @@ -278,12 +278,12 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'extract_i64' @@ -306,12 +306,12 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2i64_a = extractelement <2 x i64> undef, i32 %arg @@ -335,58 +335,58 @@ ; SSE2-LABEL: 'extract_i32' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'extract_i32' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'extract_i32' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'extract_i32' @@ -418,13 +418,13 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'extract_i32' @@ -437,13 +437,13 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'extract_i32' @@ -455,14 +455,14 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'extract_i32' @@ -494,13 +494,13 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2i32_a = extractelement <2 x i32> undef, i32 %arg @@ -610,36 +610,55 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512-LABEL: 'extract_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-LABEL: 'extract_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'extract_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'extract_i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg @@ -647,17 +666,17 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'extract_i16' @@ -686,16 +705,16 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v8i16_a = extractelement <8 x i16> undef, i32 %arg @@ -724,73 +743,73 @@ ; SSE2-LABEL: 'extract_i8' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'extract_i8' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'extract_i8' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'extract_i8' @@ -827,43 +846,67 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512-LABEL: 'extract_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-LABEL: 'extract_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'extract_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'extract_i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg @@ -872,21 +915,21 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'extract_i8' @@ -923,18 +966,18 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v16i8_a = extractelement <16 x i8> undef, i32 %arg diff --git a/llvm/test/Analysis/CostModel/X86/vector-insert.ll b/llvm/test/Analysis/CostModel/X86/vector-insert.ll --- a/llvm/test/Analysis/CostModel/X86/vector-insert.ll +++ b/llvm/test/Analysis/CostModel/X86/vector-insert.ll @@ -34,12 +34,12 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = insertelement <4 x double> undef, double undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> undef, double undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = insertelement <8 x double> undef, double undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> undef, double undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'insert_double' @@ -48,12 +48,12 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = insertelement <4 x double> undef, double undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> undef, double undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = insertelement <8 x double> undef, double undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> undef, double undef, i32 0 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'insert_double' @@ -62,12 +62,12 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = insertelement <4 x double> undef, double undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> undef, double undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = insertelement <8 x double> undef, double undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> undef, double undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2f64_a = insertelement <2 x double> undef, double undef, i32 %arg @@ -91,39 +91,39 @@ ; SSE-LABEL: 'insert_float' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3 +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3 +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3 +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'insert_float' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4 +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'insert_float' @@ -136,32 +136,32 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8 -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'insert_float' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg @@ -188,19 +188,103 @@ } define i32 @insert_i64(i32 %arg) { -; CHECK-LABEL: 'insert_i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'insert_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'insert_i64' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'insert_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'insert_i64' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'insert_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'insert_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'insert_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'insert_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg @@ -208,12 +292,12 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg @@ -234,24 +318,138 @@ } define i32 @insert_i32(i32 %arg) { -; CHECK-LABEL: 'insert_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'insert_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'insert_i32' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'insert_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'insert_i32' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'insert_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'insert_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'insert_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'insert_i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg @@ -263,13 +461,13 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg @@ -296,24 +494,81 @@ } define i32 @insert_i16(i32 %arg) { -; CHECK-LABEL: 'insert_i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'insert_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'insert_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'insert_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'insert_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'insert_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg @@ -322,16 +577,16 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg @@ -357,29 +612,197 @@ } define i32 @insert_i8(i32 %arg) { -; CHECK-LABEL: 'insert_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'insert_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'insert_i8' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'insert_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'insert_i8' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'insert_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'insert_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'insert_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'insert_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'insert_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg @@ -391,18 +814,18 @@ ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -6,32 +6,23 @@ define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) { ; NORMAL-LABEL: @foo( ; NORMAL-NEXT: entry: -; NORMAL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NORMAL: vector.ph: -; NORMAL-NEXT: br label [[VECTOR_BODY:%.*]] -; NORMAL: vector.body: -; NORMAL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NORMAL-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; NORMAL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; NORMAL-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>* -; NORMAL-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 -; NORMAL-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> -; NORMAL-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> -; NORMAL-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] -; NORMAL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; NORMAL-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; NORMAL-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4 -; NORMAL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; NORMAL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; NORMAL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 -; NORMAL: middle.block: -; NORMAL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; NORMAL: scalar.ph: ; NORMAL-NEXT: br label [[FOR_BODY:%.*]] ; NORMAL: for.cond.cleanup: ; NORMAL-NEXT: ret void ; NORMAL: for.body: -; NORMAL-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2 +; NORMAL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NORMAL-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; NORMAL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; NORMAL-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; NORMAL-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 +; NORMAL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] +; NORMAL-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; NORMAL-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP3]], [[TMP1]] +; NORMAL-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; NORMAL-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX6]], align 4 +; NORMAL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NORMAL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; NORMAL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; ; SLOW-LABEL: @foo( ; SLOW-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -18,77 +18,59 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa !1 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7 -; CHECK-NEXT: [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], -; CHECK-NEXT: [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !tbaa !1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa !1 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa !1 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa !1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa !1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP14]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP18]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP37]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP37]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96 +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP21]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP21]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 100 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa !1 +; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa !1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !1 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !1 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP25]], [[TMP24]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4 ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -247,50 +247,11 @@ } define <8 x i32> @sext_zext(<8 x i16> %a) { -; SSE-LABEL: @sext_zext( -; SSE-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R7]] -; -; SLM-LABEL: @sext_zext( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i16> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i16> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i32 3 -; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i32 4 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i32 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i32 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = sext i16 [[A0]] to i32 -; SLM-NEXT: [[AB1:%.*]] = sext i16 [[A1]] to i32 -; SLM-NEXT: [[AB2:%.*]] = sext i16 [[A2]] to i32 -; SLM-NEXT: [[AB3:%.*]] = sext i16 [[A3]] to i32 -; SLM-NEXT: [[AB4:%.*]] = zext i16 [[A4]] to i32 -; SLM-NEXT: [[AB5:%.*]] = zext i16 [[A5]] to i32 -; SLM-NEXT: [[AB6:%.*]] = zext i16 [[A6]] to i32 -; SLM-NEXT: [[AB7:%.*]] = zext i16 [[A7]] to i32 -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[R7]] -; -; AVX-LABEL: @sext_zext( -; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x i32> [[R7]] -; -; AVX512-LABEL: @sext_zext( -; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; CHECK-LABEL: @sext_zext( +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> +; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 @@ -426,26 +387,24 @@ ; SLM-NEXT: ret <8 x float> [[R7]] ; ; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; AVX-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; AVX-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; AVX-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; AVX-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; AVX-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; AVX-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; AVX-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; AVX-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float -; AVX-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float +; AVX-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> +; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> ; AVX-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; AVX-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; AVX-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; AVX-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3 ; AVX-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; AVX-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; AVX-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -49,40 +49,11 @@ } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @fmul_fdiv_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R7]] -; -; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> -; SLM-NEXT: [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> -; SLM-NEXT: [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R7]] -; -; AVX-LABEL: @fmul_fdiv_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[R7]] -; -; AVX512-LABEL: @fmul_fdiv_v8f32( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x float> [[R7]] +; CHECK-LABEL: @fmul_fdiv_v8f32( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R7]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -125,14 +96,15 @@ ; SSE-NEXT: ret <4 x float> [[TMP1]] ; ; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00 +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP4]], i32 1 ; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: ret <4 x float> [[R3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -101,44 +101,11 @@ } define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @ashr_shl_v8i32( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; SSE-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R7]] -; -; SLM-LABEL: @ashr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[R7]] -; -; AVX-LABEL: @ashr_shl_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x i32> [[R7]] -; -; AVX512-LABEL: @ashr_shl_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; CHECK-LABEL: @ashr_shl_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -243,34 +210,27 @@ ; SSE-LABEL: @ashr_lshr_shl_v8i32( ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 -; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] -; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] -; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] -; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] +; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -19,27 +19,38 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; CHECK-LABEL: @splat( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]] -; CHECK-NEXT: store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]] +; CHECK-NEXT: store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3) +; CHECK-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]] +; CHECK-NEXT: store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]] +; CHECK-NEXT: store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5) +; CHECK-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]] +; CHECK-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6) +; CHECK-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]] +; CHECK-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7) +; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8) +; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9) +; CHECK-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10) +; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11) +; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12) +; CHECK-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13) +; CHECK-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14) +; CHECK-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]] +; CHECK-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15) ; CHECK-NEXT: ret void ; %1 = xor i8 %c, %a @@ -84,20 +95,18 @@ define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @same_opcode_on_one_side( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]] -; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[C]], [[A]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[A]], [[C]] +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[C]], [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]] +; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]] +; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]] +; CHECK-NEXT: store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]] +; CHECK-NEXT: store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3) ; CHECK-NEXT: ret void ; %add1 = add i32 %c, %a diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -37,25 +37,39 @@ define void @fn2(i32* %a, i32* %b, float* %c) { ; CHECK-LABEL: @fn2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 1 +; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]] +; CHECK-NEXT: [[FP1:%.*]] = sitofp i32 [[ADD1]] to float +; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32(float [[FP1]], i32 [[ADD1]]) #2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1 +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]] +; CHECK-NEXT: [[FP2:%.*]] = sitofp i32 [[ADD2]] to float +; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32(float [[FP2]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2 +; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2 +; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]] +; CHECK-NEXT: [[FP3:%.*]] = sitofp i32 [[ADD3]] to float +; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32(float [[FP3]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32(<4 x float> [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 +; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]] +; CHECK-NEXT: [[FP4:%.*]] = sitofp i32 [[ADD4]] to float +; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32(float [[FP4]], i32 [[ADD1]]) #2 +; CHECK-NEXT: store float [[CALL1]], float* [[C:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1 +; CHECK-NEXT: store float [[CALL2]], float* [[ARRAYIDX8]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 +; CHECK-NEXT: store float [[CALL3]], float* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store float [[CALL4]], float* [[ARRAYIDX10]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -221,14 +221,10 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R07]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; SSE-NEXT: ret <8 x float> [[TMP3]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> @@ -288,33 +284,11 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: @test_v4i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x i64> [[R03]] -; -; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <4 x i64> [[TMP3]] -; -; AVX-LABEL: @test_v4i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x i64> [[TMP3]] -; -; AVX512-LABEL: @test_v4i64( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; AVX512-NEXT: ret <4 x i64> [[TMP3]] +; CHECK-LABEL: @test_v4i64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -336,33 +310,11 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @test_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R07]] -; -; SLM-LABEL: @test_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; SLM-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <8 x i32> [[TMP3]] -; -; AVX-LABEL: @test_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x i32> [[TMP3]] -; -; AVX512-LABEL: @test_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: ret <8 x i32> [[TMP3]] +; CHECK-LABEL: @test_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -221,14 +221,10 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R07]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; SSE-NEXT: ret <8 x float> [[TMP3]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> @@ -288,33 +284,11 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: @test_v4i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x i64> [[R03]] -; -; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <4 x i64> [[TMP3]] -; -; AVX-LABEL: @test_v4i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x i64> [[TMP3]] -; -; AVX512-LABEL: @test_v4i64( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; AVX512-NEXT: ret <4 x i64> [[TMP3]] +; CHECK-LABEL: @test_v4i64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -336,33 +310,11 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @test_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R07]] -; -; SLM-LABEL: @test_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; SLM-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <8 x i32> [[TMP3]] -; -; AVX-LABEL: @test_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x i32> [[TMP3]] -; -; AVX512-LABEL: @test_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: ret <8 x i32> [[TMP3]] +; CHECK-LABEL: @test_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -214,77 +214,44 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { -; BIGTHRESH-LABEL: @simple_select_no_users( -; BIGTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; BIGTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; BIGTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; BIGTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; BIGTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; BIGTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; BIGTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; BIGTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; BIGTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; BIGTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; BIGTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; BIGTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; BIGTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 -; BIGTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; BIGTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; BIGTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; BIGTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; BIGTHRESH-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; BIGTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 -; BIGTHRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; BIGTHRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 -; BIGTHRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; BIGTHRESH-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; BIGTHRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; BIGTHRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; BIGTHRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; BIGTHRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; BIGTHRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; BIGTHRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; BIGTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 -; BIGTHRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; BIGTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; BIGTHRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; BIGTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 -; BIGTHRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; BIGTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 -; BIGTHRESH-NEXT: ret <4 x float> [[RD]] -; -; ZEROTHRESH-LABEL: @simple_select_no_users( -; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0 -; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0 -; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1 -; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]] -; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] -; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1 -; ZEROTHRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1 -; ZEROTHRESH-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] -; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 -; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 -; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2 -; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3 -; ZEROTHRESH-NEXT: ret <4 x float> [[RD]] +; ANY-LABEL: @simple_select_no_users( +; ANY-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 +; ANY-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; ANY-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 +; ANY-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 +; ANY-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; ANY-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; ANY-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 +; ANY-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 +; ANY-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; ANY-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 +; ANY-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 +; ANY-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 +; ANY-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 +; ANY-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 +; ANY-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer +; ANY-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; ANY-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 +; ANY-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer +; ANY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 +; ANY-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 +; ANY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 +; ANY-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 +; ANY-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; ANY-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; ANY-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 +; ANY-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; ANY-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 +; ANY-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; ANY-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; ANY-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 +; ANY-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; ANY-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 +; ANY-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; ANY-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 +; ANY-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; ANY-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 +; ANY-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -111,13 +111,15 @@ ; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP6]], i32 1 ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float ; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -13,16 +13,13 @@ define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { ; CHECK-LABEL: @PR31243_zext( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 -; CHECK-NEXT: [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] ; CHECK-NEXT: ret i8 [[TMP8]] @@ -56,16 +53,12 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { ; CHECK-LABEL: @PR31243_sext( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -15,17 +15,16 @@ ; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 ; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 +; CHECK-NEXT: [[FOO_3:%.*]] = load i64, i64* [[FOO_1]], align 8 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 +; CHECK-NEXT: [[FOO_4:%.*]] = load i64, i64* [[FOO_2]], align 8 ; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]] +; CHECK-NEXT: [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 +; CHECK-NEXT: store i64 [[AND_2]], i64* [[BAR3]], align 8 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: store i64 [[AND_1]], i64* [[BAR4]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: @@ -50,30 +49,32 @@ ; CHECK-LABEL: @pr35497( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; CHECK-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; CHECK-NEXT: [[ADD:%.*]] = add i64 undef, undef ; CHECK-NEXT: store i64 [[ADD]], i64* undef, align 1 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], +; CHECK-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 +; CHECK-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] +; CHECK-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 +; CHECK-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 ; CHECK-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], +; CHECK-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 +; CHECK-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 +; CHECK-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 +; CHECK-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] +; CHECK-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 ; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1 +; CHECK-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 +; CHECK-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] +; CHECK-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -110,7 +110,7 @@ ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' - ; YAML-NEXT: - Cost: '-8' + ; YAML-NEXT: - Cost: '-5' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4' @@ -120,7 +120,7 @@ ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-4' + ; YAML-NEXT: - Cost: '-7' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '1' diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -12,70 +12,84 @@ ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SUB_I]] to i8 +; CHECK-NEXT: [[CONV_I_I1199:%.*]] = and i8 [[TMP1]], 1 +; CHECK-NEXT: store i8 [[CONV_I_I1199]], i8* [[TMP0]], align 1 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 +; CHECK-NEXT: [[CONV_1_I_I:%.*]] = and i8 [[TMP2]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 +; CHECK-NEXT: store i8 [[CONV_1_I_I]], i8* [[ARRAYIDX_I_I7_1_I_I]], align 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 +; CHECK-NEXT: [[CONV_2_I_I:%.*]] = and i8 [[TMP3]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 +; CHECK-NEXT: store i8 [[CONV_2_I_I]], i8* [[ARRAYIDX_I_I7_2_I_I]], align 1 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 +; CHECK-NEXT: [[CONV_3_I_I:%.*]] = and i8 [[TMP4]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3 +; CHECK-NEXT: store i8 [[CONV_3_I_I]], i8* [[ARRAYIDX_I_I7_3_I_I]], align 1 +; CHECK-NEXT: [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SHR_3_I_I]] to i8 +; CHECK-NEXT: [[CONV_4_I_I:%.*]] = and i8 [[TMP5]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4 +; CHECK-NEXT: store i8 [[CONV_4_I_I]], i8* [[ARRAYIDX_I_I7_4_I_I]], align 1 +; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_4_I_I]] to i8 +; CHECK-NEXT: [[CONV_5_I_I:%.*]] = and i8 [[TMP6]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 +; CHECK-NEXT: store i8 [[CONV_5_I_I]], i8* [[ARRAYIDX_I_I7_5_I_I]], align 1 +; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SHR_5_I_I]] to i8 +; CHECK-NEXT: [[CONV_6_I_I:%.*]] = and i8 [[TMP7]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 +; CHECK-NEXT: store i8 [[CONV_6_I_I]], i8* [[ARRAYIDX_I_I7_6_I_I]], align 1 +; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SHR_6_I_I]] to i8 +; CHECK-NEXT: [[CONV_7_I_I:%.*]] = and i8 [[TMP8]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], +; CHECK-NEXT: store i8 [[CONV_7_I_I]], i8* [[ARRAYIDX_I_I7_7_I_I]], align 1 +; CHECK-NEXT: [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SHR_7_I_I]] to i8 +; CHECK-NEXT: [[CONV_8_I_I:%.*]] = and i8 [[TMP9]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 +; CHECK-NEXT: store i8 [[CONV_8_I_I]], i8* [[ARRAYIDX_I_I7_8_I_I]], align 1 +; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SHR_8_I_I]] to i8 +; CHECK-NEXT: [[CONV_9_I_I:%.*]] = and i8 [[TMP10]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 +; CHECK-NEXT: store i8 [[CONV_9_I_I]], i8* [[ARRAYIDX_I_I7_9_I_I]], align 1 +; CHECK-NEXT: [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SHR_9_I_I]] to i8 +; CHECK-NEXT: [[CONV_10_I_I:%.*]] = and i8 [[TMP11]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 +; CHECK-NEXT: store i8 [[CONV_10_I_I]], i8* [[ARRAYIDX_I_I7_10_I_I]], align 1 +; CHECK-NEXT: [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SHR_10_I_I]] to i8 +; CHECK-NEXT: [[CONV_11_I_I:%.*]] = and i8 [[TMP12]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], +; CHECK-NEXT: store i8 [[CONV_11_I_I]], i8* [[ARRAYIDX_I_I7_11_I_I]], align 1 +; CHECK-NEXT: [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SHR_11_I_I]] to i8 +; CHECK-NEXT: [[CONV_12_I_I:%.*]] = and i8 [[TMP13]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 +; CHECK-NEXT: store i8 [[CONV_12_I_I]], i8* [[ARRAYIDX_I_I7_12_I_I]], align 1 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SHR_12_I_I]] to i8 +; CHECK-NEXT: [[CONV_13_I_I:%.*]] = and i8 [[TMP14]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 +; CHECK-NEXT: store i8 [[CONV_13_I_I]], i8* [[ARRAYIDX_I_I7_13_I_I]], align 1 ; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 +; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SHR_13_I_I]] to i8 +; CHECK-NEXT: [[CONV_14_I_I:%.*]] = and i8 [[TMP15]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 +; CHECK-NEXT: store i8 [[CONV_14_I_I]], i8* [[ARRAYIDX_I_I7_14_I_I]], align 1 ; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SHR_14_I_I]] to i8 +; CHECK-NEXT: [[CONV_15_I_I:%.*]] = and i8 [[TMP16]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1 +; CHECK-NEXT: store i8 [[CONV_15_I_I]], i8* [[ARRAYIDX_I_I7_15_I_I]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll @@ -43,23 +43,40 @@ } define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) { -; SSE-LABEL: @loadext_4i8_to_4i32( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i32> [[V3]] +; SSE2-LABEL: @loadext_4i8_to_4i32( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE2-NEXT: ret <4 x i32> [[V3]] +; +; SLM-LABEL: @loadext_4i8_to_4i32( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 +; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 +; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 +; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 +; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -115,58 +132,22 @@ ; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE-NEXT: ret <4 x i64> [[V3]] ; -; AVX1-LABEL: @loadext_4i8_to_4i64( -; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX1-NEXT: ret <4 x i64> [[V3]] -; -; AVX2-LABEL: @loadext_4i8_to_4i64( -; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX2-NEXT: ret <4 x i64> [[V3]] -; -; AVX512-LABEL: @loadext_4i8_to_4i64( -; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX512-NEXT: ret <4 x i64> [[V3]] +; AVX-LABEL: @loadext_4i8_to_4i64( +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -805,58 +786,22 @@ ; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE-NEXT: ret <4 x i64> [[V3]] ; -; AVX1-LABEL: @loadext_4i16_to_4i64( -; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX1-NEXT: ret <4 x i64> [[V3]] -; -; AVX2-LABEL: @loadext_4i16_to_4i64( -; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX2-NEXT: ret <4 x i64> [[V3]] -; -; AVX512-LABEL: @loadext_4i16_to_4i64( -; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX512-NEXT: ret <4 x i64> [[V3]] +; AVX-LABEL: @loadext_4i16_to_4i64( +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -1058,58 +1003,22 @@ ; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE-NEXT: ret <4 x i64> [[V3]] ; -; AVX1-LABEL: @loadext_4i32_to_4i64( -; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX1-NEXT: ret <4 x i64> [[V3]] -; -; AVX2-LABEL: @loadext_4i32_to_4i64( -; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX2-NEXT: ret <4 x i64> [[V3]] -; -; AVX512-LABEL: @loadext_4i32_to_4i64( -; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX512-NEXT: ret <4 x i64> [[V3]] +; AVX-LABEL: @loadext_4i32_to_4i64( +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll @@ -132,58 +132,22 @@ ; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE-NEXT: ret <4 x i64> [[V3]] ; -; AVX1-LABEL: @loadext_4i8_to_4i64( -; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX1-NEXT: ret <4 x i64> [[V3]] -; -; AVX2-LABEL: @loadext_4i8_to_4i64( -; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX2-NEXT: ret <4 x i64> [[V3]] -; -; AVX512-LABEL: @loadext_4i8_to_4i64( -; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX512-NEXT: ret <4 x i64> [[V3]] +; AVX-LABEL: @loadext_4i8_to_4i64( +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -822,58 +786,22 @@ ; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE-NEXT: ret <4 x i64> [[V3]] ; -; AVX1-LABEL: @loadext_4i16_to_4i64( -; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX1-NEXT: ret <4 x i64> [[V3]] -; -; AVX2-LABEL: @loadext_4i16_to_4i64( -; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX2-NEXT: ret <4 x i64> [[V3]] -; -; AVX512-LABEL: @loadext_4i16_to_4i64( -; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX512-NEXT: ret <4 x i64> [[V3]] +; AVX-LABEL: @loadext_4i16_to_4i64( +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -1075,58 +1003,22 @@ ; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE-NEXT: ret <4 x i64> [[V3]] ; -; AVX1-LABEL: @loadext_4i32_to_4i64( -; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX1-NEXT: ret <4 x i64> [[V3]] -; -; AVX2-LABEL: @loadext_4i32_to_4i64( -; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX2-NEXT: ret <4 x i64> [[V3]] -; -; AVX512-LABEL: @loadext_4i32_to_4i64( -; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX512-NEXT: ret <4 x i64> [[V3]] +; AVX-LABEL: @loadext_4i32_to_4i64( +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2