diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3466,8 +3466,7 @@ auto *MaskTy = FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || - (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) || - !isPowerOf2_32(NumElem)) { + (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { // Scalarization APInt DemandedElts = APInt::getAllOnesValue(NumElem); InstructionCost MaskSplitCost = @@ -3491,11 +3490,11 @@ InstructionCost Cost = 0; if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) - // Promotion requires expand/truncate for data and a shuffle for mask. + // Promotion requires extend/truncate for data and a shuffle for mask. Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) + getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr); - else if (LT.second.getVectorNumElements() > NumElem) { + else if (LT.first * LT.second.getVectorNumElements() > NumElem) { auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), LT.second.getVectorNumElements()); // Expanding requires fill mask with zeroes diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -129,51 +129,51 @@ ; ; AVX-LABEL: 'masked_load' ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0v7f64(<7 x double>* undef, i32 1, <7 x i1> undef, <7 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0v6f64(<6 x double>* undef, i32 1, <6 x i1> undef, <6 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0v5f64(<5 x double>* undef, i32 1, <5 x i1> undef, <5 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0v7f64(<7 x double>* undef, i32 1, <7 x i1> undef, <7 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0v6f64(<6 x double>* undef, i32 1, <6 x i1> undef, <6 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0v5f64(<5 x double>* undef, i32 1, <5 x i1> undef, <5 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0v3f64(<3 x double>* undef, i32 1, <3 x i1> undef, <3 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0v3f64(<3 x double>* undef, i32 1, <3 x i1> undef, <3 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0v15f32(<15 x float>* undef, i32 1, <15 x i1> undef, <15 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0v14f32(<14 x float>* undef, i32 1, <14 x i1> undef, <14 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0v13f32(<13 x float>* undef, i32 1, <13 x i1> undef, <13 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0v12f32(<12 x float>* undef, i32 1, <12 x i1> undef, <12 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0v11f32(<11 x float>* undef, i32 1, <11 x i1> undef, <11 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0v10f32(<10 x float>* undef, i32 1, <10 x i1> undef, <10 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0v9f32(<9 x float>* undef, i32 1, <9 x i1> undef, <9 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0v15f32(<15 x float>* undef, i32 1, <15 x i1> undef, <15 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0v14f32(<14 x float>* undef, i32 1, <14 x i1> undef, <14 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0v13f32(<13 x float>* undef, i32 1, <13 x i1> undef, <13 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0v12f32(<12 x float>* undef, i32 1, <12 x i1> undef, <12 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0v11f32(<11 x float>* undef, i32 1, <11 x i1> undef, <11 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0v10f32(<10 x float>* undef, i32 1, <10 x i1> undef, <10 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0v9f32(<9 x float>* undef, i32 1, <9 x i1> undef, <9 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* undef, i32 1, <7 x i1> undef, <7 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0v6f32(<6 x float>* undef, i32 1, <6 x i1> undef, <6 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0v5f32(<5 x float>* undef, i32 1, <5 x i1> undef, <5 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* undef, i32 1, <7 x i1> undef, <7 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0v6f32(<6 x float>* undef, i32 1, <6 x i1> undef, <6 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0v5f32(<5 x float>* undef, i32 1, <5 x i1> undef, <5 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0v3f32(<3 x float>* undef, i32 1, <3 x i1> undef, <3 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0v3f32(<3 x float>* undef, i32 1, <3 x i1> undef, <3 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float>* undef, i32 1, <1 x i1> undef, <1 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0v7i64(<7 x i64>* undef, i32 1, <7 x i1> undef, <7 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0v6i64(<6 x i64>* undef, i32 1, <6 x i1> undef, <6 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0v5i64(<5 x i64>* undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0v7i64(<7 x i64>* undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0v6i64(<6 x i64>* undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0v5i64(<5 x i64>* undef, i32 1, <5 x i1> undef, <5 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0v3i64(<3 x i64>* undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0v3i64(<3 x i64>* undef, i32 1, <3 x i1> undef, <3 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0v15i32(<15 x i32>* undef, i32 1, <15 x i1> undef, <15 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0v14i32(<14 x i32>* undef, i32 1, <14 x i1> undef, <14 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0v13i32(<13 x i32>* undef, i32 1, <13 x i1> undef, <13 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0v12i32(<12 x i32>* undef, i32 1, <12 x i1> undef, <12 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0v11i32(<11 x i32>* undef, i32 1, <11 x i1> undef, <11 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0v10i32(<10 x i32>* undef, i32 1, <10 x i1> undef, <10 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0v9i32(<9 x i32>* undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0v15i32(<15 x i32>* undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0v14i32(<14 x i32>* undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0v13i32(<13 x i32>* undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0v12i32(<12 x i32>* undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0v11i32(<11 x i32>* undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0v10i32(<10 x i32>* undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0v9i32(<9 x i32>* undef, i32 1, <9 x i1> undef, <9 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0v7i32(<7 x i32>* undef, i32 1, <7 x i1> undef, <7 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0v6i32(<6 x i32>* undef, i32 1, <6 x i1> undef, <6 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0v5i32(<5 x i32>* undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0v7i32(<7 x i32>* undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0v6i32(<6 x i32>* undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0v5i32(<5 x i32>* undef, i32 1, <5 x i1> undef, <5 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0v3i32(<3 x i32>* undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0v3i32(<3 x i32>* undef, i32 1, <3 x i1> undef, <3 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>* undef, i32 1, <1 x i1> undef, <1 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) @@ -188,51 +188,51 @@ ; ; KNL-LABEL: 'masked_load' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0v7f64(<7 x double>* undef, i32 1, <7 x i1> undef, <7 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0v6f64(<6 x double>* undef, i32 1, <6 x i1> undef, <6 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0v5f64(<5 x double>* undef, i32 1, <5 x i1> undef, <5 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0v7f64(<7 x double>* undef, i32 1, <7 x i1> undef, <7 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0v6f64(<6 x double>* undef, i32 1, <6 x i1> undef, <6 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0v5f64(<5 x double>* undef, i32 1, <5 x i1> undef, <5 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0v3f64(<3 x double>* undef, i32 1, <3 x i1> undef, <3 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0v3f64(<3 x double>* undef, i32 1, <3 x i1> undef, <3 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0v15f32(<15 x float>* undef, i32 1, <15 x i1> undef, <15 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0v14f32(<14 x float>* undef, i32 1, <14 x i1> undef, <14 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0v13f32(<13 x float>* undef, i32 1, <13 x i1> undef, <13 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0v12f32(<12 x float>* undef, i32 1, <12 x i1> undef, <12 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0v11f32(<11 x float>* undef, i32 1, <11 x i1> undef, <11 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0v10f32(<10 x float>* undef, i32 1, <10 x i1> undef, <10 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0v9f32(<9 x float>* undef, i32 1, <9 x i1> undef, <9 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0v15f32(<15 x float>* undef, i32 1, <15 x i1> undef, <15 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0v14f32(<14 x float>* undef, i32 1, <14 x i1> undef, <14 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0v13f32(<13 x float>* undef, i32 1, <13 x i1> undef, <13 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0v12f32(<12 x float>* undef, i32 1, <12 x i1> undef, <12 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0v11f32(<11 x float>* undef, i32 1, <11 x i1> undef, <11 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0v10f32(<10 x float>* undef, i32 1, <10 x i1> undef, <10 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0v9f32(<9 x float>* undef, i32 1, <9 x i1> undef, <9 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* undef, i32 1, <7 x i1> undef, <7 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0v6f32(<6 x float>* undef, i32 1, <6 x i1> undef, <6 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0v5f32(<5 x float>* undef, i32 1, <5 x i1> undef, <5 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* undef, i32 1, <7 x i1> undef, <7 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0v6f32(<6 x float>* undef, i32 1, <6 x i1> undef, <6 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0v5f32(<5 x float>* undef, i32 1, <5 x i1> undef, <5 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0v3f32(<3 x float>* undef, i32 1, <3 x i1> undef, <3 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0v3f32(<3 x float>* undef, i32 1, <3 x i1> undef, <3 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float>* undef, i32 1, <1 x i1> undef, <1 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0v7i64(<7 x i64>* undef, i32 1, <7 x i1> undef, <7 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0v6i64(<6 x i64>* undef, i32 1, <6 x i1> undef, <6 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0v5i64(<5 x i64>* undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0v7i64(<7 x i64>* undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0v6i64(<6 x i64>* undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0v5i64(<5 x i64>* undef, i32 1, <5 x i1> undef, <5 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0v3i64(<3 x i64>* undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0v3i64(<3 x i64>* undef, i32 1, <3 x i1> undef, <3 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0v15i32(<15 x i32>* undef, i32 1, <15 x i1> undef, <15 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0v14i32(<14 x i32>* undef, i32 1, <14 x i1> undef, <14 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0v13i32(<13 x i32>* undef, i32 1, <13 x i1> undef, <13 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0v12i32(<12 x i32>* undef, i32 1, <12 x i1> undef, <12 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0v11i32(<11 x i32>* undef, i32 1, <11 x i1> undef, <11 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0v10i32(<10 x i32>* undef, i32 1, <10 x i1> undef, <10 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0v9i32(<9 x i32>* undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0v15i32(<15 x i32>* undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0v14i32(<14 x i32>* undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0v13i32(<13 x i32>* undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0v12i32(<12 x i32>* undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0v11i32(<11 x i32>* undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0v10i32(<10 x i32>* undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0v9i32(<9 x i32>* undef, i32 1, <9 x i1> undef, <9 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0v7i32(<7 x i32>* undef, i32 1, <7 x i1> undef, <7 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0v6i32(<6 x i32>* undef, i32 1, <6 x i1> undef, <6 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0v5i32(<5 x i32>* undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0v7i32(<7 x i32>* undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0v6i32(<6 x i32>* undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0v5i32(<5 x i32>* undef, i32 1, <5 x i1> undef, <5 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0v3i32(<3 x i32>* undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0v3i32(<3 x i32>* undef, i32 1, <3 x i1> undef, <3 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>* undef, i32 1, <1 x i1> undef, <1 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) @@ -247,51 +247,51 @@ ; ; SKX-LABEL: 'masked_load' ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0v7f64(<7 x double>* undef, i32 1, <7 x i1> undef, <7 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0v6f64(<6 x double>* undef, i32 1, <6 x i1> undef, <6 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0v5f64(<5 x double>* undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0v7f64(<7 x double>* undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0v6f64(<6 x double>* undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0v5f64(<5 x double>* undef, i32 1, <5 x i1> undef, <5 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0v3f64(<3 x double>* undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0v3f64(<3 x double>* undef, i32 1, <3 x i1> undef, <3 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0v15f32(<15 x float>* undef, i32 1, <15 x i1> undef, <15 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0v14f32(<14 x float>* undef, i32 1, <14 x i1> undef, <14 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0v13f32(<13 x float>* undef, i32 1, <13 x i1> undef, <13 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0v12f32(<12 x float>* undef, i32 1, <12 x i1> undef, <12 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0v11f32(<11 x float>* undef, i32 1, <11 x i1> undef, <11 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0v10f32(<10 x float>* undef, i32 1, <10 x i1> undef, <10 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0v9f32(<9 x float>* undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0v15f32(<15 x float>* undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0v14f32(<14 x float>* undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0v13f32(<13 x float>* undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0v12f32(<12 x float>* undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0v11f32(<11 x float>* undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0v10f32(<10 x float>* undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0v9f32(<9 x float>* undef, i32 1, <9 x i1> undef, <9 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* undef, i32 1, <7 x i1> undef, <7 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0v6f32(<6 x float>* undef, i32 1, <6 x i1> undef, <6 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0v5f32(<5 x float>* undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0v6f32(<6 x float>* undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0v5f32(<5 x float>* undef, i32 1, <5 x i1> undef, <5 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0v3f32(<3 x float>* undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0v3f32(<3 x float>* undef, i32 1, <3 x i1> undef, <3 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0v1f32(<1 x float>* undef, i32 1, <1 x i1> undef, <1 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0v7i64(<7 x i64>* undef, i32 1, <7 x i1> undef, <7 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0v6i64(<6 x i64>* undef, i32 1, <6 x i1> undef, <6 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0v5i64(<5 x i64>* undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0v7i64(<7 x i64>* undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0v6i64(<6 x i64>* undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0v5i64(<5 x i64>* undef, i32 1, <5 x i1> undef, <5 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0v3i64(<3 x i64>* undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0v3i64(<3 x i64>* undef, i32 1, <3 x i1> undef, <3 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0v15i32(<15 x i32>* undef, i32 1, <15 x i1> undef, <15 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0v14i32(<14 x i32>* undef, i32 1, <14 x i1> undef, <14 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0v13i32(<13 x i32>* undef, i32 1, <13 x i1> undef, <13 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0v12i32(<12 x i32>* undef, i32 1, <12 x i1> undef, <12 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0v11i32(<11 x i32>* undef, i32 1, <11 x i1> undef, <11 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0v10i32(<10 x i32>* undef, i32 1, <10 x i1> undef, <10 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0v9i32(<9 x i32>* undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0v15i32(<15 x i32>* undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0v14i32(<14 x i32>* undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0v13i32(<13 x i32>* undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0v12i32(<12 x i32>* undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0v11i32(<11 x i32>* undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0v10i32(<10 x i32>* undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0v9i32(<9 x i32>* undef, i32 1, <9 x i1> undef, <9 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0v7i32(<7 x i32>* undef, i32 1, <7 x i1> undef, <7 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0v6i32(<6 x i32>* undef, i32 1, <6 x i1> undef, <6 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0v5i32(<5 x i32>* undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0v7i32(<7 x i32>* undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0v6i32(<6 x i32>* undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0v5i32(<5 x i32>* undef, i32 1, <5 x i1> undef, <5 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0v3i32(<3 x i32>* undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0v3i32(<3 x i32>* undef, i32 1, <3 x i1> undef, <3 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>* undef, i32 1, <1 x i1> undef, <1 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef) @@ -490,51 +490,51 @@ ; ; AVX-LABEL: 'masked_store' ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v7f64.p0v7f64(<7 x double> undef, <7 x double>* undef, i32 1, <7 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v6f64.p0v6f64(<6 x double> undef, <6 x double>* undef, i32 1, <6 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v5f64.p0v5f64(<5 x double> undef, <5 x double>* undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0v7f64(<7 x double> undef, <7 x double>* undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0v6f64(<6 x double> undef, <6 x double>* undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0v5f64(<5 x double> undef, <5 x double>* undef, i32 1, <5 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f64.p0v3f64(<3 x double> undef, <3 x double>* undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0v3f64(<3 x double> undef, <3 x double>* undef, i32 1, <3 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.store.v15f32.p0v15f32(<15 x float> undef, <15 x float>* undef, i32 1, <15 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 58 for instruction: call void @llvm.masked.store.v14f32.p0v14f32(<14 x float> undef, <14 x float>* undef, i32 1, <14 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v13f32.p0v13f32(<13 x float> undef, <13 x float>* undef, i32 1, <13 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.store.v12f32.p0v12f32(<12 x float> undef, <12 x float>* undef, i32 1, <12 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v11f32.p0v11f32(<11 x float> undef, <11 x float>* undef, i32 1, <11 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v10f32.p0v10f32(<10 x float> undef, <10 x float>* undef, i32 1, <10 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v9f32.p0v9f32(<9 x float> undef, <9 x float>* undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0v15f32(<15 x float> undef, <15 x float>* undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0v14f32(<14 x float> undef, <14 x float>* undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0v13f32(<13 x float> undef, <13 x float>* undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0v12f32(<12 x float> undef, <12 x float>* undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0v11f32(<11 x float> undef, <11 x float>* undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0v10f32(<10 x float> undef, <10 x float>* undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0v9f32(<9 x float> undef, <9 x float>* undef, i32 1, <9 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.store.v7f32.p0v7f32(<7 x float> undef, <7 x float>* undef, i32 1, <7 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6f32.p0v6f32(<6 x float> undef, <6 x float>* undef, i32 1, <6 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v5f32.p0v5f32(<5 x float> undef, <5 x float>* undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0v7f32(<7 x float> undef, <7 x float>* undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0v6f32(<6 x float> undef, <6 x float>* undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0v5f32(<5 x float> undef, <5 x float>* undef, i32 1, <5 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f32.p0v3f32(<3 x float> undef, <3 x float>* undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0v3f32(<3 x float> undef, <3 x float>* undef, i32 1, <3 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0v1f32(<1 x float> undef, <1 x float>* undef, i32 1, <1 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7i64.p0v7i64(<7 x i64> undef, <7 x i64>* undef, i32 1, <7 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6i64.p0v6i64(<6 x i64> undef, <6 x i64>* undef, i32 1, <6 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5i64.p0v5i64(<5 x i64> undef, <5 x i64>* undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0v7i64(<7 x i64> undef, <7 x i64>* undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0v6i64(<6 x i64> undef, <6 x i64>* undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0v5i64(<5 x i64> undef, <5 x i64>* undef, i32 1, <5 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3i64.p0v3i64(<3 x i64> undef, <3 x i64>* undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0v3i64(<3 x i64> undef, <3 x i64>* undef, i32 1, <3 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.store.v15i32.p0v15i32(<15 x i32> undef, <15 x i32>* undef, i32 1, <15 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v14i32.p0v14i32(<14 x i32> undef, <14 x i32>* undef, i32 1, <14 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v13i32.p0v13i32(<13 x i32> undef, <13 x i32>* undef, i32 1, <13 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v12i32.p0v12i32(<12 x i32> undef, <12 x i32>* undef, i32 1, <12 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v11i32.p0v11i32(<11 x i32> undef, <11 x i32>* undef, i32 1, <11 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v10i32.p0v10i32(<10 x i32> undef, <10 x i32>* undef, i32 1, <10 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v9i32.p0v9i32(<9 x i32> undef, <9 x i32>* undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0v15i32(<15 x i32> undef, <15 x i32>* undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0v14i32(<14 x i32> undef, <14 x i32>* undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0v13i32(<13 x i32> undef, <13 x i32>* undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0v12i32(<12 x i32> undef, <12 x i32>* undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0v11i32(<11 x i32> undef, <11 x i32>* undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0v10i32(<10 x i32> undef, <10 x i32>* undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0v9i32(<9 x i32> undef, <9 x i32>* undef, i32 1, <9 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7i32.p0v7i32(<7 x i32> undef, <7 x i32>* undef, i32 1, <7 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6i32.p0v6i32(<6 x i32> undef, <6 x i32>* undef, i32 1, <6 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v5i32.p0v5i32(<5 x i32> undef, <5 x i32>* undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0v7i32(<7 x i32> undef, <7 x i32>* undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0v6i32(<6 x i32> undef, <6 x i32>* undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0v5i32(<5 x i32> undef, <5 x i32>* undef, i32 1, <5 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> undef, <3 x i32>* undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> undef, <3 x i32>* undef, i32 1, <3 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32> undef, <1 x i32>* undef, i32 1, <1 x i1> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) @@ -549,51 +549,51 @@ ; ; KNL-LABEL: 'masked_store' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.store.v7f64.p0v7f64(<7 x double> undef, <7 x double>* undef, i32 1, <7 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v6f64.p0v6f64(<6 x double> undef, <6 x double>* undef, i32 1, <6 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5f64.p0v5f64(<5 x double> undef, <5 x double>* undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0v7f64(<7 x double> undef, <7 x double>* undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0v6f64(<6 x double> undef, <6 x double>* undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0v5f64(<5 x double> undef, <5 x double>* undef, i32 1, <5 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f64.p0v3f64(<3 x double> undef, <3 x double>* undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0v3f64(<3 x double> undef, <3 x double>* undef, i32 1, <3 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.store.v15f32.p0v15f32(<15 x float> undef, <15 x float>* undef, i32 1, <15 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v14f32.p0v14f32(<14 x float> undef, <14 x float>* undef, i32 1, <14 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v13f32.p0v13f32(<13 x float> undef, <13 x float>* undef, i32 1, <13 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v12f32.p0v12f32(<12 x float> undef, <12 x float>* undef, i32 1, <12 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v11f32.p0v11f32(<11 x float> undef, <11 x float>* undef, i32 1, <11 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v10f32.p0v10f32(<10 x float> undef, <10 x float>* undef, i32 1, <10 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v9f32.p0v9f32(<9 x float> undef, <9 x float>* undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0v15f32(<15 x float> undef, <15 x float>* undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0v14f32(<14 x float> undef, <14 x float>* undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0v13f32(<13 x float> undef, <13 x float>* undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0v12f32(<12 x float> undef, <12 x float>* undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0v11f32(<11 x float> undef, <11 x float>* undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0v10f32(<10 x float> undef, <10 x float>* undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0v9f32(<9 x float> undef, <9 x float>* undef, i32 1, <9 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.store.v7f32.p0v7f32(<7 x float> undef, <7 x float>* undef, i32 1, <7 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6f32.p0v6f32(<6 x float> undef, <6 x float>* undef, i32 1, <6 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v5f32.p0v5f32(<5 x float> undef, <5 x float>* undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0v7f32(<7 x float> undef, <7 x float>* undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0v6f32(<6 x float> undef, <6 x float>* undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0v5f32(<5 x float> undef, <5 x float>* undef, i32 1, <5 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f32.p0v3f32(<3 x float> undef, <3 x float>* undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0v3f32(<3 x float> undef, <3 x float>* undef, i32 1, <3 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0v1f32(<1 x float> undef, <1 x float>* undef, i32 1, <1 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7i64.p0v7i64(<7 x i64> undef, <7 x i64>* undef, i32 1, <7 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6i64.p0v6i64(<6 x i64> undef, <6 x i64>* undef, i32 1, <6 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5i64.p0v5i64(<5 x i64> undef, <5 x i64>* undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0v7i64(<7 x i64> undef, <7 x i64>* undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0v6i64(<6 x i64> undef, <6 x i64>* undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0v5i64(<5 x i64> undef, <5 x i64>* undef, i32 1, <5 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3i64.p0v3i64(<3 x i64> undef, <3 x i64>* undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0v3i64(<3 x i64> undef, <3 x i64>* undef, i32 1, <3 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15i32.p0v15i32(<15 x i32> undef, <15 x i32>* undef, i32 1, <15 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14i32.p0v14i32(<14 x i32> undef, <14 x i32>* undef, i32 1, <14 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13i32.p0v13i32(<13 x i32> undef, <13 x i32>* undef, i32 1, <13 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v12i32.p0v12i32(<12 x i32> undef, <12 x i32>* undef, i32 1, <12 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v11i32.p0v11i32(<11 x i32> undef, <11 x i32>* undef, i32 1, <11 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v10i32.p0v10i32(<10 x i32> undef, <10 x i32>* undef, i32 1, <10 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v9i32.p0v9i32(<9 x i32> undef, <9 x i32>* undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0v15i32(<15 x i32> undef, <15 x i32>* undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0v14i32(<14 x i32> undef, <14 x i32>* undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0v13i32(<13 x i32> undef, <13 x i32>* undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0v12i32(<12 x i32> undef, <12 x i32>* undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0v11i32(<11 x i32> undef, <11 x i32>* undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0v10i32(<10 x i32> undef, <10 x i32>* undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0v9i32(<9 x i32> undef, <9 x i32>* undef, i32 1, <9 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7i32.p0v7i32(<7 x i32> undef, <7 x i32>* undef, i32 1, <7 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6i32.p0v6i32(<6 x i32> undef, <6 x i32>* undef, i32 1, <6 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v5i32.p0v5i32(<5 x i32> undef, <5 x i32>* undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0v7i32(<7 x i32> undef, <7 x i32>* undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0v6i32(<6 x i32> undef, <6 x i32>* undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0v5i32(<5 x i32> undef, <5 x i32>* undef, i32 1, <5 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> undef, <3 x i32>* undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> undef, <3 x i32>* undef, i32 1, <3 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32> undef, <1 x i32>* undef, i32 1, <1 x i1> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 168 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef) @@ -608,51 +608,51 @@ ; ; SKX-LABEL: 'masked_store' ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.store.v7f64.p0v7f64(<7 x double> undef, <7 x double>* undef, i32 1, <7 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v6f64.p0v6f64(<6 x double> undef, <6 x double>* undef, i32 1, <6 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5f64.p0v5f64(<5 x double> undef, <5 x double>* undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0v7f64(<7 x double> undef, <7 x double>* undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0v6f64(<6 x double> undef, <6 x double>* undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0v5f64(<5 x double> undef, <5 x double>* undef, i32 1, <5 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f64.p0v3f64(<3 x double> undef, <3 x double>* undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0v3f64(<3 x double> undef, <3 x double>* undef, i32 1, <3 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.store.v15f32.p0v15f32(<15 x float> undef, <15 x float>* undef, i32 1, <15 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v14f32.p0v14f32(<14 x float> undef, <14 x float>* undef, i32 1, <14 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v13f32.p0v13f32(<13 x float> undef, <13 x float>* undef, i32 1, <13 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v12f32.p0v12f32(<12 x float> undef, <12 x float>* undef, i32 1, <12 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v11f32.p0v11f32(<11 x float> undef, <11 x float>* undef, i32 1, <11 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v10f32.p0v10f32(<10 x float> undef, <10 x float>* undef, i32 1, <10 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v9f32.p0v9f32(<9 x float> undef, <9 x float>* undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0v15f32(<15 x float> undef, <15 x float>* undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0v14f32(<14 x float> undef, <14 x float>* undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0v13f32(<13 x float> undef, <13 x float>* undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0v12f32(<12 x float> undef, <12 x float>* undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0v11f32(<11 x float> undef, <11 x float>* undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0v10f32(<10 x float> undef, <10 x float>* undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0v9f32(<9 x float> undef, <9 x float>* undef, i32 1, <9 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.store.v7f32.p0v7f32(<7 x float> undef, <7 x float>* undef, i32 1, <7 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6f32.p0v6f32(<6 x float> undef, <6 x float>* undef, i32 1, <6 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v5f32.p0v5f32(<5 x float> undef, <5 x float>* undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0v7f32(<7 x float> undef, <7 x float>* undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0v6f32(<6 x float> undef, <6 x float>* undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0v5f32(<5 x float> undef, <5 x float>* undef, i32 1, <5 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f32.p0v3f32(<3 x float> undef, <3 x float>* undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0v3f32(<3 x float> undef, <3 x float>* undef, i32 1, <3 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0v1f32(<1 x float> undef, <1 x float>* undef, i32 1, <1 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7i64.p0v7i64(<7 x i64> undef, <7 x i64>* undef, i32 1, <7 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6i64.p0v6i64(<6 x i64> undef, <6 x i64>* undef, i32 1, <6 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5i64.p0v5i64(<5 x i64> undef, <5 x i64>* undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0v7i64(<7 x i64> undef, <7 x i64>* undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0v6i64(<6 x i64> undef, <6 x i64>* undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0v5i64(<5 x i64> undef, <5 x i64>* undef, i32 1, <5 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3i64.p0v3i64(<3 x i64> undef, <3 x i64>* undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0v3i64(<3 x i64> undef, <3 x i64>* undef, i32 1, <3 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15i32.p0v15i32(<15 x i32> undef, <15 x i32>* undef, i32 1, <15 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14i32.p0v14i32(<14 x i32> undef, <14 x i32>* undef, i32 1, <14 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13i32.p0v13i32(<13 x i32> undef, <13 x i32>* undef, i32 1, <13 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v12i32.p0v12i32(<12 x i32> undef, <12 x i32>* undef, i32 1, <12 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v11i32.p0v11i32(<11 x i32> undef, <11 x i32>* undef, i32 1, <11 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v10i32.p0v10i32(<10 x i32> undef, <10 x i32>* undef, i32 1, <10 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v9i32.p0v9i32(<9 x i32> undef, <9 x i32>* undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0v15i32(<15 x i32> undef, <15 x i32>* undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0v14i32(<14 x i32> undef, <14 x i32>* undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0v13i32(<13 x i32> undef, <13 x i32>* undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0v12i32(<12 x i32> undef, <12 x i32>* undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0v11i32(<11 x i32> undef, <11 x i32>* undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0v10i32(<10 x i32> undef, <10 x i32>* undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0v9i32(<9 x i32> undef, <9 x i32>* undef, i32 1, <9 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7i32.p0v7i32(<7 x i32> undef, <7 x i32>* undef, i32 1, <7 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6i32.p0v6i32(<6 x i32> undef, <6 x i32>* undef, i32 1, <6 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v5i32.p0v5i32(<5 x i32> undef, <5 x i32>* undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0v7i32(<7 x i32> undef, <7 x i32>* undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0v6i32(<6 x i32> undef, <6 x i32>* undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0v5i32(<5 x i32> undef, <5 x i32>* undef, i32 1, <5 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> undef, <3 x i32>* undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> undef, <3 x i32>* undef, i32 1, <3 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32> undef, <1 x i32>* undef, i32 1, <1 x i1> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)