# Summary (text before the first "Index:" header is ignored by patch tools):
#
# * X86ISelLowering.cpp: the VZEXT_LOAD path now accepts a consecutive load of
#   32 or 64 bits (element 0 first, remaining elements zero/undef). The scalar
#   type and element count are derived from LoadSize instead of being fixed to
#   f64/i64. The separate 32-bit VZEXT_MOVL block is removed.
# * X86InstrAVX512.td / X86InstrSSE.td: add X86vzload patterns for the
#   32-bit-element vector types (v4f32/v8f32/v16f32, v4i32/v8i32/v16i32) that
#   select the MOVSS / MOVD load forms. Three result lines in X86InstrAVX512.td
#   are only re-indented.
#
# NOTE(review): the v8i32 X86vzload pattern in X86InstrSSE.td uses
#   SUBREG_TO_REG (i32 0), matching the other 32-bit-element patterns added
#   here and the v8i32 X86vzmovl pattern directly above it.
# NOTE(review): indentation and blank context lines were reconstructed from the
#   hunk line counts; verify against the tree or apply with `patch -l`.
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -5728,11 +5728,13 @@
   int LoadSize =
       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
 
-  // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
-  if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
+  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
+  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
+      (LoadSize == 32 || LoadSize == 64) &&
       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
-    MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
-    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
+    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
+                                      : MVT::getIntegerVT(LoadSize);
+    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
     if (TLI.isTypeLegal(VecVT)) {
       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -5759,20 +5761,6 @@
     }
   }
 
-  // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
-  if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
-      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
-    MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
-    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
-    if (TLI.isTypeLegal(VecVT)) {
-      SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
-                                     : DAG.getBitcast(VecSVT, EltBase);
-      V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
-      V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
-      return DAG.getBitcast(VT, V);
-    }
-  }
-
   return SDValue();
 }
 
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -3202,6 +3202,8 @@
             (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
   def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
             (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
 
   // MOVSDrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -3224,6 +3226,8 @@
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
@@ -3238,6 +3242,8 @@
   def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
                     (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v16f32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
   def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
@@ -3367,17 +3373,20 @@
   let AddedComplexity = 20 in {
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
-
   def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVDI2PDIZrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzload addr:$src)),
+            (VMOVDI2PDIZrm addr:$src)>;
+  def : Pat<(v8i32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-                 (VMOVZPQILo2PQIZrm addr:$src)>;
+            (VMOVZPQILo2PQIZrm addr:$src)>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
-                 (VMOVZPQILo2PQIZrr VR128X:$src)>;
+            (VMOVZPQILo2PQIZrr VR128X:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
-                 (VMOVZPQILo2PQIZrm addr:$src)>;
+            (VMOVZPQILo2PQIZrm addr:$src)>;
   def : Pat<(v4i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
   }
@@ -3386,12 +3395,13 @@
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                 (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
-
   def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
                 (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
 
   // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
+  def : Pat<(v16i32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
   def : Pat<(v8i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
 }
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -590,6 +590,8 @@
             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
   def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
 
   // MOVSDrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -609,6 +611,8 @@
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
@@ -697,6 +701,8 @@
             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
   def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
 }
 
 // Extract and store.
@@ -4876,9 +4882,13 @@
               (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
               (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzload addr:$src)),
+              (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                 (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
               (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+    def : Pat<(v8i32 (X86vzload addr:$src)),
+              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
   }
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
@@ -4901,6 +4911,8 @@
               (MOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
               (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzload addr:$src)),
+              (MOVDI2PDIrm addr:$src)>;
   }
 }
 