Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1141,6 +1141,45 @@
                                                 Custom);
     }
 
+    // SSE41 also brings vector sign/zero-extending loads.
+    for (int MemI = MVT::FIRST_INTEGER_VECTOR_VALUETYPE;
+         MemI != MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++MemI) {
+      MVT MemVT = (MVT::SimpleValueType)MemI;
+
+      // The only legal memory types are vectors no wider than 64 bits.
+      if (MemVT.getSizeInBits() > 64)
+        continue;
+
+      // 1-element vectors aren't legal.
+      if (MemVT.getVectorNumElements() == 1)
+        continue;
+
+      // Some but not all of the [SZ]EXTLOADs are legal.
+      // For instance, on AVX, this isn't legal:
+      //   v4i64 load, zext from v4i8
+      // but this is:
+      //   v4i32 load, zext from v4i8
+      for (int ValI = MVT::FIRST_INTEGER_VECTOR_VALUETYPE;
+           ValI <= MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++ValI) {
+        MVT ValVT = (MVT::SimpleValueType)ValI;
+
+        // The result type must be a legal SSE vector.
+        if (!ValVT.is128BitVector())
+          continue;
+
+        // The memory elt type must be smaller than the result.
+        if (ValVT.getVectorElementType().bitsLE(MemVT.getVectorElementType()))
+          continue;
+
+        // The memory type must have as many elements as the result.
+        if (ValVT.getVectorNumElements() != MemVT.getVectorNumElements())
+          continue;
+
+        setLoadExtAction(ISD::SEXTLOAD, ValVT, MemVT, Legal);
+        setLoadExtAction(ISD::ZEXTLOAD, ValVT, MemVT, Legal);
+      }
+    }
+
     // i8 and i16 vectors are custom because the source register and source
     // memory operand types are not the same width.  f32 vectors are custom
     // since the immediate controlling the insert encodes additional
@@ -1312,6 +1351,39 @@
       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+      for (int MemI = MVT::FIRST_VECTOR_VALUETYPE;
+           MemI != MVT::LAST_VECTOR_VALUETYPE; ++MemI) {
+        MVT MemVT = (MVT::SimpleValueType)MemI;
+
+        // The only legal memory types are vectors no wider than 128 bits.
+        if (MemVT.getSizeInBits() > 128)
+          continue;
+
+        // 1-element vectors aren't legal.
+        if (MemVT.getVectorNumElements() == 1)
+          continue;
+
+        for (int ValI = MVT::FIRST_VECTOR_VALUETYPE;
+             ValI <= MVT::LAST_VECTOR_VALUETYPE; ++ValI) {
+          MVT ValVT = (MVT::SimpleValueType)ValI;
+
+          // The result type must be a legal AVX vector.
+          if (!ValVT.is256BitVector())
+            continue;
+
+          // The memory elt type must be smaller than the result.
+          if (ValVT.getVectorElementType().bitsLE(MemVT.getVectorElementType()))
+            continue;
+
+          // The memory type must have as many elements as the result.
+          if (ValVT.getVectorNumElements() != MemVT.getVectorNumElements())
+            continue;
+
+          setLoadExtAction(ISD::SEXTLOAD, ValVT, MemVT, Legal);
+          setLoadExtAction(ISD::ZEXTLOAD, ValVT, MemVT, Legal);
+        }
+      }
     } else {
       setOperationAction(ISD::ADD, MVT::v4i64, Custom);
       setOperationAction(ISD::ADD, MVT::v8i32, Custom);
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6222,6 +6222,36 @@
 let Predicates = [HasAVX2] in {
   defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", X86vsext>;
   defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", X86vzext>;
+
+  def : Pat<(v4i64 (zextloadvi8 addr:$src)),
+            (VPMOVZXBQYrm addr:$src)>;
+  def : Pat<(v8i32 (zextloadvi8 addr:$src)),
+            (VPMOVZXBDYrm addr:$src)>;
+  def : Pat<(v16i16 (zextloadvi8 addr:$src)),
+            (VPMOVZXBWYrm addr:$src)>;
+
+  def : Pat<(v4i64 (zextloadvi16 addr:$src)),
+            (VPMOVZXWQYrm addr:$src)>;
+  def : Pat<(v8i32 (zextloadvi16 addr:$src)),
+            (VPMOVZXWDYrm addr:$src)>;
+
+  def : Pat<(v4i64 (zextloadvi32 addr:$src)),
+            (VPMOVZXDQYrm addr:$src)>;
+
+  def : Pat<(v4i64 (sextloadvi8 addr:$src)),
+            (VPMOVSXBQYrm addr:$src)>;
+  def : Pat<(v8i32 (sextloadvi8 addr:$src)),
+            (VPMOVSXBDYrm addr:$src)>;
+  def : Pat<(v16i16 (sextloadvi8 addr:$src)),
+            (VPMOVSXBWYrm addr:$src)>;
+
+  def : Pat<(v4i64 (sextloadvi16 addr:$src)),
+            (VPMOVSXWQYrm addr:$src)>;
+  def : Pat<(v8i32 (sextloadvi16 addr:$src)),
+            (VPMOVSXWDYrm addr:$src)>;
+
+  def : Pat<(v4i64 (sextloadvi32 addr:$src)),
+            (VPMOVSXDQYrm addr:$src)>;
 }
 
 // SSE4.1/AVX patterns.
@@ -6306,11 +6336,71 @@
 let Predicates = [HasAVX] in {
   defm : SS41I_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>;
   defm : SS41I_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>;
+
+  def : Pat<(v2i64 (zextloadvi8 addr:$src)),
+            (VPMOVZXBQrm addr:$src)>;
+  def : Pat<(v4i32 (zextloadvi8 addr:$src)),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(v8i16 (zextloadvi8 addr:$src)),
+            (VPMOVZXBWrm addr:$src)>;
+
+  def : Pat<(v2i64 (zextloadvi16 addr:$src)),
+            (VPMOVZXWQrm addr:$src)>;
+  def : Pat<(v4i32 (zextloadvi16 addr:$src)),
+            (VPMOVZXWDrm addr:$src)>;
+
+  def : Pat<(v2i64 (zextloadvi32 addr:$src)),
+            (VPMOVZXDQrm addr:$src)>;
+
+  def : Pat<(v2i64 (sextloadvi8 addr:$src)),
+            (VPMOVSXBQrm addr:$src)>;
+  def : Pat<(v4i32 (sextloadvi8 addr:$src)),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(v8i16 (sextloadvi8 addr:$src)),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(v2i64 (sextloadvi16 addr:$src)),
+            (VPMOVSXWQrm addr:$src)>;
+  def : Pat<(v4i32 (sextloadvi16 addr:$src)),
+            (VPMOVSXWDrm addr:$src)>;
+
+  def : Pat<(v2i64 (sextloadvi32 addr:$src)),
+            (VPMOVSXDQrm addr:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
   defm : SS41I_pmovx_patterns<"PMOVSX", X86vsext, extloadi32i16>;
   defm : SS41I_pmovx_patterns<"PMOVZX", X86vzext, loadi16_anyext>;
+
+  def : Pat<(v2i64 (zextloadvi8 addr:$src)),
+            (PMOVZXBQrm addr:$src)>;
+  def : Pat<(v4i32 (zextloadvi8 addr:$src)),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(v8i16 (zextloadvi8 addr:$src)),
+            (PMOVZXBWrm addr:$src)>;
+
+  def : Pat<(v2i64 (zextloadvi16 addr:$src)),
+            (PMOVZXWQrm addr:$src)>;
+  def : Pat<(v4i32 (zextloadvi16 addr:$src)),
+            (PMOVZXWDrm addr:$src)>;
+
+  def : Pat<(v2i64 (zextloadvi32 addr:$src)),
+            (PMOVZXDQrm addr:$src)>;
+
+  def : Pat<(v2i64 (sextloadvi8 addr:$src)),
+            (PMOVSXBQrm addr:$src)>;
+  def : Pat<(v4i32 (sextloadvi8 addr:$src)),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(v8i16 (sextloadvi8 addr:$src)),
+            (PMOVSXBWrm addr:$src)>;
+
+  def : Pat<(v2i64 (sextloadvi16 addr:$src)),
+            (PMOVSXWQrm addr:$src)>;
+  def : Pat<(v4i32 (sextloadvi16 addr:$src)),
+            (PMOVSXWDrm addr:$src)>;
+
+  def : Pat<(v2i64 (sextloadvi32 addr:$src)),
+            (PMOVSXDQrm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
Index: test/CodeGen/X86/pointer-vector.ll
===================================================================
--- test/CodeGen/X86/pointer-vector.ll
+++ test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@
 entry:
   %G = load <4 x i8>* %p
 ;CHECK: movl
-;CHECK: pmovzxbd
-;CHECK: pand
+;CHECK: pmovzxbd (%
   %K = inttoptr <4 x i8> %G to <4 x i32*>
 ;CHECK: ret
   ret <4 x i32*> %K
Index: test/CodeGen/X86/widen_load-2.ll
===================================================================
--- test/CodeGen/X86/widen_load-2.ll
+++ test/CodeGen/X86/widen_load-2.ll
@@ -194,7 +194,6 @@
 ; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
 ; CHECK-NEXT: movb $1, 2(%[[PTR1]])
 ; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
-; CHECK-NEXT: pand {{.*}}, %[[X0]]
 ; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
 ; CHECK-NEXT: shrl %e[[R0]]x
 ; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
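Note for reviewers: to make the intent of the test updates above easier to see, here is a minimal standalone test sketch in the style of the .ll files touched by this patch. It is illustrative only and not part of the patch; the RUN line, triple, function name, and the exact register constraints in the CHECK lines are assumptions. The behaviour it checks, a vector zero-extending load selected as a single pmovzxbd with a memory operand and no trailing pand, is what the new SSE4.1 ZEXTLOAD legality plus the PMOVZXBDrm pattern are expected to produce, mirroring the pointer-vector.ll and widen_load-2.ll changes.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s

; A <4 x i8> load zero-extended to <4 x i32> should select one extending
; load from memory rather than a scalar load, a register pmovzxbd, and a
; pand to clear the high bits.
define <4 x i32> @zextload_v4i8_to_v4i32(<4 x i8>* %p) {
; CHECK-LABEL: zextload_v4i8_to_v4i32:
; CHECK: pmovzxbd (%{{.*}}), %xmm{{[0-9]+}}
; CHECK-NOT: pand
  %v = load <4 x i8>* %p
  %z = zext <4 x i8> %v to <4 x i32>
  ret <4 x i32> %z
}

An analogous AVX2 case (for example, a <8 x i8> load sign-extended to <8 x i32> selecting vpmovsxbd from memory into a ymm register) could be checked the same way, since the 256-bit result types are covered by the AVX2 legality loop and the Y-register patterns above.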