Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -33675,6 +33675,26 @@
                        Movl, N->getOperand(0).getOperand(2));
   }
 
+  // If this is a vzmovl of a full vector load, replace it with a vzload,
+  // unless the load is volatile.
+  if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
+      ISD::isNormalLoad(N->getOperand(0).getNode())) {
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    if (!LN->isVolatile()) {
+      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue VZLoad =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+                                  VT.getVectorElementType(),
+                                  LN->getPointerInfo(),
+                                  LN->getAlignment(),
+                                  MachineMemOperand::MOLoad);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 0), VZLoad.getValue(1));
+      return VZLoad;
+    }
+  }
+
+
   // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
   // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
   // FIXME: This can probably go away once we default to widening legalization.
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -4317,15 +4317,11 @@
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
             (VMOVSSZrm addr:$src)>;
-  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
-            (VMOVSSZrm addr:$src)>;
 
   // MOVSDrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
             (VMOVSDZrm addr:$src)>;
-  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
-            (VMOVSDZrm addr:$src)>;
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
@@ -4363,14 +4359,10 @@
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
-            (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v8i32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (VMOVQI2PQIZrm addr:$src)>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
             (VMOVZPQILo2PQIZrr VR128X:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -267,8 +267,6 @@
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
             (VMOVSSrm addr:$src)>;
-  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
-            (VMOVSSrm addr:$src)>;
   def : Pat<(v4f32 (X86vzload addr:$src)),
             (VMOVSSrm addr:$src)>;
 
@@ -276,8 +274,6 @@
   // MOVSDrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
             (VMOVSDrm addr:$src)>;
-  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
-            (VMOVSDrm addr:$src)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (VMOVSDrm addr:$src)>;
@@ -321,16 +317,12 @@
   // MOVSSrm already zeros the high parts of the register.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
             (MOVSSrm addr:$src)>;
-  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
-            (MOVSSrm addr:$src)>;
 }
 
 let Predicates = [UseSSE2] in {
   // MOVSDrm already zeros the high parts of the register.
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
             (MOVSDrm addr:$src)>;
-  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
-            (MOVSDrm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4145,8 +4137,6 @@
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
-            (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v8i32 (X86vzload addr:$src)),
@@ -4163,8 +4153,6 @@
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
-            (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (MOVDI2PDIrm addr:$src)>;
 }
@@ -4233,8 +4221,6 @@
                 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
 
 let Predicates = [UseAVX] in {
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (VMOVQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
             (VMOVQI2PQIrm addr:$src)>;
   def : Pat<(v4i64 (X86vzload addr:$src)),
@@ -4245,8 +4231,6 @@
 }
 
 let Predicates = [UseSSE2] in {
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (MOVQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
             (MOVQI2PQIrm addr:$src)>;
   def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
Index: llvm/trunk/test/CodeGen/X86/vector-zmov.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zmov.ll
+++ llvm/trunk/test/CodeGen/X86/vector-zmov.ll
@@ -37,16 +37,33 @@
   ret <2 x i64>%Y
 }
 
-; FIXME: We shouldn't shrink the load to movss here since it is volatile.
 define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(<4 x i32> *%ptr) {
-; SSE-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    retq
+; SSE2-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps (%rdi), %xmm1
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps (%rdi), %xmm1
+; SSSE3-NEXT:    xorps %xmm0, %xmm0
+; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movaps (%rdi), %xmm1
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 entry:
   %X = load volatile <4 x i32>, <4 x i32>* %ptr
@@ -54,16 +71,17 @@
   ret <4 x i32>%Y
 }
 
-; FIXME: We shouldn't shrink the load to movsd here since it is volatile.
 define <2 x i64> @load_zmov_2i64_to_0z_volatile(<2 x i64> *%ptr) {
 ; SSE-LABEL: load_zmov_2i64_to_0z_volatile:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_zmov_2i64_to_0z_volatile:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX-NEXT:    retq
 entry:
   %X = load volatile <2 x i64>, <2 x i64>* %ptr