Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -33658,6 +33658,22 @@
     }
   }
 
+  // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+  // insert into a zero vector. This helps get VZEXT_MOVL closer to
+  // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+  // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+  if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
+      N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
+      N->getOperand(0).hasOneUse() &&
+      N->getOperand(0).getOperand(0).isUndef() &&
+      isNullConstant(N->getOperand(0).getOperand(2))) {
+    SDValue In = N->getOperand(0).getOperand(1);
+    SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+                       getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+                       Movl, N->getOperand(0).getOperand(2));
+  }
+
   // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
   // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
   // FIXME: This can probably go away once we default to widening legalization.
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -4329,39 +4329,17 @@
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
   def : Pat<(v8f32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 
   // Represent the same patterns above but in the form they appear for
   // 512-bit types
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
   def : Pat<(v16f32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
   def : Pat<(v8f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
 }
 
 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
@@ -4380,14 +4358,6 @@
 
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIZrr GR64:$src)>;
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
@@ -4408,14 +4378,6 @@
 
   def : Pat<(v4i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-
   // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
   def : Pat<(v16i32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -283,14 +283,8 @@
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v8f32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
 }
@@ -4145,9 +4139,6 @@
 
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
             (VMOV64toPQIrr GR64:$src)>;
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
   // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
   // These instructions also write zeros in the high part of a 256-bit register.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
@@ -4158,15 +4149,8 @@
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
   def : Pat<(v8i32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
 }
 
 let Predicates = [UseSSE2] in {
@@ -4253,9 +4237,6 @@
             (VMOVQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
             (VMOVQI2PQIrm addr:$src)>;
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
 
Index: llvm/trunk/test/CodeGen/X86/avx-load-store.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-load-store.ll
+++ llvm/trunk/test/CodeGen/X86/avx-load-store.ll
@@ -240,6 +240,7 @@
 ; CHECK_O0-NEXT:  .LBB9_3: # %cif_mixed_test_all
 ; CHECK_O0-NEXT:    movl $-1, %eax
 ; CHECK_O0-NEXT:    vmovd %eax, %xmm0
+; CHECK_O0-NEXT:    vmovdqa %xmm0, %xmm0
 ; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
 ; CHECK_O0-NEXT:    # implicit-def: $rcx
 ; CHECK_O0-NEXT:    # implicit-def: $ymm2
Index: llvm/trunk/test/CodeGen/X86/vec_extract-avx.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_extract-avx.ll
+++ llvm/trunk/test/CodeGen/X86/vec_extract-avx.ll
@@ -144,17 +144,15 @@
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovdqu (%ecx), %xmm0
-; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT:    vmovdqa %ymm0, (%eax)
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2i64_4i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT:    vmovdqa %ymm0, (%rsi)
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x i64>, <2 x i64>* %in, align 8
@@ -196,17 +194,15 @@
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovdqu (%ecx), %xmm0
-; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT:    vmovdqa %ymm0, (%eax)
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2f64_4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT:    vmovdqa %ymm0, (%rsi)
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x double>, <2 x double>* %in, align 8
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1514,7 +1514,6 @@
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
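
As an illustration of the DAG shape the new combine targets, consider inserting a 64-bit scalar into the low element of an otherwise-zero 256-bit vector. This is a hypothetical sketch in the spirit of the tests above, not one of the changed files, and the function name is made up. After type legalization the wide scalar_to_vector is canonicalized to an insert_subvector of a 128-bit scalar_to_vector into undef, and the zeroing of the remaining elements becomes X86ISD::VZEXT_MOVL wrapped around that insert, which is the form the removed 256/512-bit isel patterns used to match directly.

; Hypothetical example, not part of this commit's test changes.
define <4 x i64> @scalar_into_zero_v4i64(i64 %a) {
  ; Element 0 comes from %a and every other element is zero, so lowering
  ; produces roughly (VZEXT_MOVL (insert_subvector undef, (scalar_to_vector %a), 0)).
  %v = insertelement <4 x i64> zeroinitializer, i64 %a, i32 0
  ret <4 x i64> %v
}

With the combine, the VZEXT_MOVL is applied to the 128-bit value and the surrounding insert is re-expressed as an insert into a zero vector, so the existing 128-bit patterns plus the insert-into-zero-vector handling cover selection, which is why the dedicated wide patterns above can be deleted.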