Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1138,6 +1138,45 @@
                        Custom);
   }
+    // SSE41 also brings vector sign/zero extending loads.
+    for (int MemI = MVT::FIRST_INTEGER_VECTOR_VALUETYPE;
+         MemI != MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++MemI) {
+      MVT MemVT = (MVT::SimpleValueType)MemI;
+
+      // The only legal memory types are vectors of 64 bits or smaller.
+      if (MemVT.getSizeInBits() > 64)
+        continue;
+
+      // 1-element vectors aren't legal.
+      if (MemVT.getVectorNumElements() == 1)
+        continue;
+
+      // Some but not all of the [SZ]EXTLOADs are legal.
+      // For instance, on AVX, this isn't legal:
+      //   v4i64 load, zext from v4i8
+      // but this is:
+      //   v4i32 load, zext from v4i8
+      for (int ValI = MVT::FIRST_INTEGER_VECTOR_VALUETYPE;
+           ValI <= MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++ValI) {
+        MVT ValVT = (MVT::SimpleValueType)ValI;
+
+        // The result type must be a legal SSE vector.
+        if (!ValVT.is128BitVector())
+          continue;
+
+        // The memory elt type must be smaller than the result elt type.
+        if (ValVT.getVectorElementType().bitsLE(MemVT.getVectorElementType()))
+          continue;
+
+        // The memory type must have as many elements as the result.
+        if (ValVT.getVectorNumElements() != MemVT.getVectorNumElements())
+          continue;
+
+        setLoadExtAction(ISD::SEXTLOAD, ValVT, MemVT, Legal);
+        setLoadExtAction(ISD::ZEXTLOAD, ValVT, MemVT, Legal);
+      }
+    }
+
     // i8 and i16 vectors are custom because the source register and source
     // memory operand types are not the same width.  f32 vectors are custom
     // since the immediate controlling the insert encodes additional
@@ -1309,6 +1348,39 @@
       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+      for (int MemI = MVT::FIRST_VECTOR_VALUETYPE;
+           MemI != MVT::LAST_VECTOR_VALUETYPE; ++MemI) {
+        MVT MemVT = (MVT::SimpleValueType)MemI;
+
+        // The only legal memory types are vectors of 128 bits or smaller.
+        if (MemVT.getSizeInBits() > 128)
+          continue;
+
+        // 1-element vectors aren't legal.
+        if (MemVT.getVectorNumElements() == 1)
+          continue;
+
+        for (int ValI = MVT::FIRST_VECTOR_VALUETYPE;
+             ValI <= MVT::LAST_VECTOR_VALUETYPE; ++ValI) {
+          MVT ValVT = (MVT::SimpleValueType)ValI;
+
+          // The result type must be a legal AVX vector.
+          if (!ValVT.is256BitVector())
+            continue;
+
+          // The memory elt type must be smaller than the result elt type.
+          if (ValVT.getVectorElementType().bitsLE(MemVT.getVectorElementType()))
+            continue;
+
+          // The memory type must have as many elements as the result.
+          if (ValVT.getVectorNumElements() != MemVT.getVectorNumElements())
+            continue;
+
+          setLoadExtAction(ISD::SEXTLOAD, ValVT, MemVT, Legal);
+          setLoadExtAction(ISD::ZEXTLOAD, ValVT, MemVT, Legal);
+        }
+      }
     } else {
       setOperationAction(ISD::ADD, MVT::v4i64, Custom);
       setOperationAction(ISD::ADD, MVT::v8i32, Custom);
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6146,7 +6146,7 @@
 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;

 // AVX2 Patterns
-multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> {
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
   // Register-Register patterns
   def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
             (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
@@ -6163,6 +6163,22 @@
   def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
             (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;

+  // Simple Register-Memory patterns
+  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+
   // AVX2 Register-Memory patterns
   def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
@@ -6220,13 +6236,13 @@
 }

 let Predicates = [HasAVX2] in {
-  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", X86vsext>;
-  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", X86vzext>;
+  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
+  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
 }

 // SSE4.1/AVX patterns.
-multiclass SS41I_pmovx_patterns<string OpcPrefix, SDNode ExtOp, PatFrag ExtLoad16> {
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp, PatFrag ExtLoad16> {
   def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
             (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
   def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
@@ -6242,6 +6258,21 @@
   def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
             (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;

+  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+
   def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
@@ -6304,13 +6335,13 @@
 }

 let Predicates = [HasAVX] in {
-  defm : SS41I_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>;
-  defm : SS41I_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>;
+  defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
 }

 let Predicates = [UseSSE41] in {
-  defm : SS41I_pmovx_patterns<"PMOVSX", X86vsext, extloadi32i16>;
-  defm : SS41I_pmovx_patterns<"PMOVZX", X86vzext, loadi16_anyext>;
+  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
 }

 //===----------------------------------------------------------------------===//
Index: test/CodeGen/X86/pointer-vector.ll
===================================================================
--- test/CodeGen/X86/pointer-vector.ll
+++ test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@
 entry:
   %G = load <4 x i8>* %p
 ;CHECK: movl
-;CHECK: pmovzxbd
-;CHECK: pand
+;CHECK: pmovzxbd (%
   %K = inttoptr <4 x i8> %G to <4 x i32*>
 ;CHECK: ret
   ret <4 x i32*> %K
Index: test/CodeGen/X86/widen_load-2.ll
===================================================================
--- test/CodeGen/X86/widen_load-2.ll
+++ test/CodeGen/X86/widen_load-2.ll
@@ -193,8 +193,9 @@
 ; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
 ; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
 ; CHECK-NEXT: movb $1, 2(%[[PTR1]])
-; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
-; CHECK-NEXT: pand {{.*}}, %[[X0]]
+; CHECK-NEXT: movl (%[[PTR0]]), [[TMP1:%e[abcd]+x]]
+; CHECK-NEXT: movl [[TMP1]], [[TMP2:.*]]
+; CHECK-NEXT: pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]]
 ; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
 ; CHECK-NEXT: shrl %e[[R0]]x
 ; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
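
Illustrative example (reviewer note, not part of the patch): a minimal IR sketch, in the
style of the tests above, of the case the new [SZ]EXTLOAD legality and the "s"/"z"
extload patterns are meant to catch. The function name, RUN line, and the exact expected
assembly are assumptions, not taken from the patch; the intent is that a sign-extending
vector load selects a single PMOVSX with a folded memory operand instead of a separate
load plus extend:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s
define <4 x i32> @sext_load_v4i8_to_v4i32(<4 x i8>* %p) {
entry:
  ; Load four i8 elements and sign-extend them to i32.
  %v = load <4 x i8>* %p
  %e = sext <4 x i8> %v to <4 x i32>
  ; Expected (roughly) a single folded-load instruction, e.g.:
  ; CHECK: pmovsxbd (%{{.*}}), %xmm0
  ret <4 x i32> %e
}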