Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1314,8 +1314,10 @@ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, @@ -1371,14 +1373,14 @@ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, @@ -1402,13 +1404,13 @@ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, Index: llvm/test/Analysis/CostModel/X86/cast.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/cast.ll +++ llvm/test/Analysis/CostModel/X86/cast.ll @@ -139,11 +139,11 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %B = zext <8 x i16> undef to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = sext <4 x i32> undef to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <4 x i32> undef to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> @@ -168,12 +168,12 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> @@ -198,12 +198,12 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> Index: llvm/test/Analysis/CostModel/X86/extend.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/extend.ll +++ llvm/test/Analysis/CostModel/X86/extend.ll @@ -104,8 +104,8 @@ ; ; AVX2-LABEL: 'zext_vXi16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> @@ -114,7 +114,7 @@ ; ; AVX512-LABEL: 'zext_vXi16' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> @@ -207,11 +207,11 @@ ; ; AVX2-LABEL: 'zext_vXi8' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> @@ -222,11 +222,11 @@ ; ; AVX512F-LABEL: 'zext_vXi8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> @@ -237,11 +237,11 @@ ; ; AVX512BW-LABEL: 'zext_vXi8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> @@ -518,8 +518,8 @@ ; ; AVX1-LABEL: 'sext_vXi16' ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> @@ -528,8 +528,8 @@ ; ; AVX2-LABEL: 'sext_vXi16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> @@ -538,7 +538,7 @@ ; ; AVX512-LABEL: 'sext_vXi16' ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> @@ -548,8 +548,8 @@ ; ; BTVER2-LABEL: 'sext_vXi16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> @@ -616,11 +616,11 @@ ; ; AVX1-LABEL: 'sext_vXi8' ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> @@ -631,11 +631,11 @@ ; ; AVX2-LABEL: 'sext_vXi8' ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> @@ -646,11 +646,11 @@ ; ; AVX512F-LABEL: 'sext_vXi8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> @@ -661,11 +661,11 @@ ; ; AVX512BW-LABEL: 'sext_vXi8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> @@ -676,11 +676,11 @@ ; ; BTVER2-LABEL: 'sext_vXi8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> Index: llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll +++ llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll @@ -5,7 +5,7 @@ define void @zext256() "min-legal-vector-width"="256" { ; VEC256-LABEL: 'zext256' -; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = zext <8 x i16> undef to <8 x i64> ; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64> ; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32> ; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32> @@ -30,7 +30,7 @@ define void @zext512() "min-legal-vector-width"="512" { ; AVX-LABEL: 'zext512' -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = zext <8 x i16> undef to <8 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32> @@ -63,8 +63,8 @@ define void @sext256() "min-legal-vector-width"="256" { ; VEC256-LABEL: 'sext256' -; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64> -; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = sext <8 x i16> undef to <8 x i64> ; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64> ; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32> ; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32> @@ -72,7 +72,7 @@ ; VEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; VEC512-LABEL: 'sext256' -; VEC512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i8> undef to <8 x i64> ; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64> ; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64> ; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32> @@ -91,8 +91,8 @@ define void @sext512() "min-legal-vector-width"="512" { ; AVX-LABEL: 'sext512' -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = sext <8 x i16> undef to <8 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64> ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32> @@ -100,7 +100,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKX256-LABEL: 'sext512' -; SKX256-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i8> undef to <8 x i64> ; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64> ; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64> ; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32> @@ -109,7 +109,7 @@ ; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; VEC512-LABEL: 'sext512' -; VEC512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i8> undef to <8 x i64> ; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64> ; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64> ; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32> Index: llvm/test/Transforms/SLPVectorizer/X86/cast.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/cast.ll +++ llvm/test/Transforms/SLPVectorizer/X86/cast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=CHECK,SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=CHECK,AVX target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -76,21 +76,30 @@ } define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) { -; CHECK-LABEL: @test_sext_4i16_to_4i64( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 -; CHECK-NEXT: ret i64 undef +; SSE42-LABEL: @test_sext_4i16_to_4i64( +; SSE42-NEXT: entry: +; SSE42-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>* +; SSE42-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1 +; SSE42-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64> +; SSE42-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; SSE42-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4 +; SSE42-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2 +; SSE42-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 +; SSE42-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>* +; SSE42-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1 +; SSE42-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64> +; SSE42-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>* +; SSE42-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 +; SSE42-NEXT: ret i64 undef +; +; AVX-LABEL: @test_sext_4i16_to_4i64( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>* +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1 +; AVX-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64> +; AVX-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <4 x i64>* +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4 +; AVX-NEXT: ret i64 undef ; entry: %0 = load i16, i16* %B, align 1 Index: llvm/test/Transforms/SLPVectorizer/X86/sext.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/sext.ll +++ llvm/test/Transforms/SLPVectorizer/X86/sext.ll @@ -160,24 +160,58 @@ ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i8_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* -; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; AVX-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i8_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i8_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i8_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -262,125 +296,34 @@ } define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) { -; SSE-LABEL: @loadext_8i8_to_8i32( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE-NEXT: ret <8 x i32> [[V7]] -; -; AVX1-LABEL: @loadext_8i8_to_8i32( -; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX1-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; AVX1-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; AVX1-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; AVX1-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX1-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; AVX1-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; AVX1-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; AVX1-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; AVX1-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32 -; AVX1-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32 -; AVX1-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32 -; AVX1-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX1-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX1-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX1-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; AVX1-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; AVX1-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; AVX1-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; AVX1-NEXT: ret <8 x i32> [[V7]] -; -; AVX2-LABEL: @loadext_8i8_to_8i32( -; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; AVX2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; AVX2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; AVX2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; AVX2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX2-NEXT: ret <8 x i32> [[V7]] -; -; AVX512-LABEL: @loadext_8i8_to_8i32( -; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX512-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; AVX512-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; AVX512-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; AVX512-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX512-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX512-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX512-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX512-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX512-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX512-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX512-NEXT: ret <8 x i32> [[V7]] +; CHECK-LABEL: @loadext_8i8_to_8i32( +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 +; CHECK-NEXT: ret <8 x i32> [[V7]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -655,24 +598,58 @@ ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i16_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* -; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; AVX-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i16_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i16_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i16_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 Index: llvm/test/Transforms/SLPVectorizer/X86/zext.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/zext.ll +++ llvm/test/Transforms/SLPVectorizer/X86/zext.ll @@ -125,24 +125,58 @@ ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i8_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* -; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 -; AVX-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i8_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i8_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i8_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -529,24 +563,58 @@ ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i16_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* -; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 -; AVX-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i16_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i16_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i16_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2