Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -1136,6 +1136,21 @@
       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
     }
 
+    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
     // i8 and i16 vectors are custom because the source register and source
     // source memory operand types are not the same width. f32 vectors are
     // custom since the immediate controlling the insert encodes additional
@@ -1315,6 +1330,21 @@
 
     // Custom CTPOP always performs better on natively supported v8i32
     setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+
+    // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
+
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
   } else {
     setOperationAction(ISD::ADD, MVT::v4i64, Custom);
     setOperationAction(ISD::ADD, MVT::v8i32, Custom);
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -6120,7 +6120,7 @@
 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;
 
 // AVX2 Patterns
-multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> {
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
   // Register-Register patterns
   def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
             (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
@@ -6154,6 +6154,22 @@
   def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))),
             (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
 
+  // Simple Register-Memory patterns
+  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+
   // AVX2 Register-Memory patterns
   def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
@@ -6211,13 +6227,13 @@
 }
 
 let Predicates = [HasAVX2] in {
-  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", X86vsext>;
-  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", X86vzext>;
+  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
+  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
 }
 
 // SSE4.1/AVX patterns.
-multiclass SS41I_pmovx_patterns<string OpcPrefix, SDNode ExtOp, PatFrag ExtLoad16> {
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp, PatFrag ExtLoad16> {
   def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
             (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
   def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
             (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
@@ -6233,6 +6249,21 @@
   def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
             (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
 
+  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+
   def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
@@ -6295,13 +6326,13 @@
 }
 
 let Predicates = [HasAVX] in {
-  defm : SS41I_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>;
-  defm : SS41I_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>;
+  defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
 }
 
 let Predicates = [UseSSE41] in {
-  defm : SS41I_pmovx_patterns<"PMOVSX", X86vsext, extloadi32i16>;
-  defm : SS41I_pmovx_patterns<"PMOVZX", X86vzext, loadi16_anyext>;
+  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
 }
 
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll
+++ llvm/trunk/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll
@@ -15,8 +15,6 @@
 ; CHECK-LABEL: test_avx2_pmovx_256
 ; We really don't care about the generated code.
 ; CHECK: vpmovzxbd
-; CHECK: vpbroadcastd
-; CHECK: vpand
 ; CHECK: vcvtdq2ps
 ; CHECK: vmovups
 ; CHECK: vzeroupper
Index: llvm/trunk/test/CodeGen/X86/pointer-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pointer-vector.ll
+++ llvm/trunk/test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@
 entry:
   %G = load <4 x i8>* %p
 ;CHECK: movl
-;CHECK: pmovzxbd
-;CHECK: pand
+;CHECK: pmovzxbd (%
   %K = inttoptr <4 x i8> %G to <4 x i32*>
 ;CHECK: ret
   ret <4 x i32*> %K
Index: llvm/trunk/test/CodeGen/X86/widen_load-2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-2.ll
+++ llvm/trunk/test/CodeGen/X86/widen_load-2.ll
@@ -191,8 +191,9 @@
 ; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
 ; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
 ; CHECK-NEXT: movb $1, 2(%[[PTR1]])
-; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
-; CHECK-NEXT: pand {{.*}}, %[[X0]]
+; CHECK-NEXT: movl (%[[PTR0]]), [[TMP1:%e[abcd]+x]]
+; CHECK-NEXT: movl [[TMP1]], [[TMP2:.*]]
+; CHECK-NEXT: pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]]
 ; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
 ; CHECK-NEXT: shrl %e[[R0]]x
 ; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
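
Note (illustration, not part of the patch): the effect of the new "Simple
Register-Memory" patterns can be seen with a hypothetical SSE4.1 test along
the lines of the pointer-vector.ll change above. DAGCombine folds the
load + zext pair into a ZEXTLOAD node, which this change makes Legal for
v4i32/v4i8, so the new patterns select it directly to a memory-operand
pmovzxbd, where the old lowering needed an extra pand masking step (see the
removed CHECK lines):

; Hypothetical test (e.g. llc -mcpu=corei7); function name is illustrative.
define <4 x i32> @zext_load_v4i8_to_v4i32(<4 x i8>* %p) {
entry:
  %v = load <4 x i8>* %p              ; 4-byte vector load
  %e = zext <4 x i8> %v to <4 x i32>  ; folded with the load into a ZEXTLOAD
  ret <4 x i32> %e                    ; expected: pmovzxbd (%{{.*}}), %xmm0
}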