Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -25647,6 +25647,48 @@
   return OptimizeConditionalInDecrement(N, DAG);
 }
 
+/// performVSZEXTCombine - Performs VSEXT/VZEXT combines
+static SDValue performVSZEXTCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  SDLoc DL(N);
+  MVT VT = N->getSimpleValueType(0);
+  SDValue Op = N->getOperand(0);
+  MVT OpVT = Op.getSimpleValueType();
+  MVT OpEltVT = OpVT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT MemVT = MVT::getVectorVT(OpEltVT, NumElts);
+
+  SDValue V = Op;
+  while (V.getOpcode() == ISD::BITCAST) {
+    if (!V.hasOneUse())
+      return SDValue();
+    V = V.getOperand(0);
+  }
+
+  // (vzext (load p)) -> (zextload p)
+  // (vzext (vzload p)) -> (zextload p)
+  // (vzext (scalar_to_vector (load p))) -> (zextload p)
+  // (vzext (vzmovl (scalar_to_vector (load p)))) -> (zextload p)
+  MemSDNode *Ld = nullptr;
+  if (V.getOpcode() == X86ISD::VZEXT_LOAD)
+    Ld = cast<MemSDNode>(V);
+  else if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    Ld = dyn_cast<LoadSDNode>(V.getOperand(0));
+  else if (V.getOpcode() == X86ISD::VZEXT_MOVL &&
+           V.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+           V.getOperand(0).hasOneUse())
+    Ld = dyn_cast<LoadSDNode>(V.getOperand(0).getOperand(0));
+
+  if (Ld && Ld->hasOneUse() && V.hasOneUse())
+    return DAG.getExtLoad(
+        N->getOpcode() == X86ISD::VZEXT ? ISD::ZEXTLOAD : ISD::SEXTLOAD, DL, VT,
+        Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT,
+        Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
+        Ld->getAlignment());
+
+  return SDValue();
+}
+
 /// performVZEXTCombine - Performs build vector combines
 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
@@ -25710,7 +25752,7 @@
     }
   }
 
-  return SDValue();
+  return performVSZEXTCombine(N, DAG, DCI);
 }
 
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
@@ -25757,6 +25799,7 @@
   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VSEXT:       return performVSZEXTCombine(N, DAG, DCI);
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGNR:
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6172,55 +6172,25 @@
   // AVX2 Register-Memory patterns
   def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
-            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
-            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
 
   def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
 }
@@ -6263,63 +6233,21 @@
   def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
 
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
 
-  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
-            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
-            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
 
-  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
-            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
-            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
 
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
 
-  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
-            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
-            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
 
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
-            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
-            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
 }
Index: test/CodeGen/X86/vector-sext.ll
===================================================================
--- test/CodeGen/X86/vector-sext.ll
+++ test/CodeGen/X86/vector-sext.ll
@@ -791,9 +791,8 @@
 ; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
 ; X32-SSE41:       # BB#0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT:    movd (%eax), %xmm0
-; X32-SSE41-NEXT:    pmovzxbd %xmm0, %xmm1
-; X32-SSE41-NEXT:    pmovzxbq %xmm0, %xmm2
+; X32-SSE41-NEXT:    pmovzxbd (%eax), %xmm1
+; X32-SSE41-NEXT:    pmovzxdq %xmm1, %xmm2
 ; X32-SSE41-NEXT:    movd %xmm2, %eax
 ; X32-SSE41-NEXT:    movsbl %al, %eax
 ; X32-SSE41-NEXT:    movd %eax, %xmm0
@@ -910,9 +909,8 @@
 ; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
 ; X32-SSE41:       # BB#0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT:    movsd (%eax), %xmm0
-; X32-SSE41-NEXT:    pmovzxwd %xmm0, %xmm1
-; X32-SSE41-NEXT:    pmovzxwq %xmm0, %xmm2
+; X32-SSE41-NEXT:    pmovzxwd (%eax), %xmm1
+; X32-SSE41-NEXT:    pmovzxdq %xmm1, %xmm2
 ; X32-SSE41-NEXT:    movd %xmm2, %eax
 ; X32-SSE41-NEXT:    cwtl
 ; X32-SSE41-NEXT:    movd %eax, %xmm0
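
Illustration (not part of the patch): the X32-SSE41 test changes above come from IR like the sketch below, which mirrors the existing load_sext_4i16_to_4i64 test in vector-sext.ll (typed-pointer syntax assumed for this release). On x86-32, the <4 x i16> load is legalized into a scalar load wrapped in scalar_to_vector/vzmovl nodes; the new DAG combine rewrites the X86ISD::VSEXT/VZEXT of that chain into a single extending load, so selection folds the memory operand (pmovzxwd (%eax) instead of movsd + pmovzxwd).

; Hypothetical minimal reproducer, same shape as load_sext_4i16_to_4i64:
define <4 x i64> @sext_4i16_to_4i64(<4 x i16>* %p) {
entry:
  %v = load <4 x i16>, <4 x i16>* %p
  %s = sext <4 x i16> %v to <4 x i64>
  ret <4 x i64> %s
}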