Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -30399,6 +30399,13 @@
   // it's better to handle them early to be sure we emit efficient code by
   // avoiding store-load conversions.
   if (VT == MVT::x86mmx) {
+    // Detect zero MMX vectors.
+    if (X86::isZeroNode(N0) || ISD::isBuildVectorAllZeros(N0.getNode())) {
+      SDLoc DL(N0);
+      return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
+                         DAG.getConstant(0, DL, MVT::i32));
+    }
+
     // Detect bitcasts between i32 to x86mmx low word.
     if (N0.getOpcode() == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 &&
         isNullConstant(N0.getOperand(1))) {
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -7910,6 +7910,8 @@
     return Expand2AddrUndef(MIB, get(X86::SBB32rr));
   case X86::SETB_C64r:
     return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+  case X86::MMX_SET0:
+    return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
  case X86::V_SET0:
  case X86::FsFLD0SS:
  case X86::FsFLD0SD:
@@ -8877,6 +8879,7 @@
   case X86::AVX512_128_SET0:
     Alignment = 16;
     break;
+  case X86::MMX_SET0:
   case X86::FsFLD0SD:
   case X86::AVX512_FsFLD0SD:
     Alignment = 8;
@@ -8910,6 +8913,7 @@
 
   SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
   switch (LoadMI.getOpcode()) {
+  case X86::MMX_SET0:
   case X86::V_SET0:
   case X86::V_SETALLONES:
   case X86::AVX2_SETALLONES:
@@ -8957,6 +8961,8 @@
     else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
              Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
+    else if (Opc == X86::MMX_SET0)
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2);
     else
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);
 
Index: lib/Target/X86/X86InstrMMX.td
===================================================================
--- lib/Target/X86/X86InstrMMX.td
+++ lib/Target/X86/X86InstrMMX.td
@@ -90,6 +90,15 @@
 >;
 }
 
+// Alias instruction that maps a zero vector to pxor mmx.
+// This is expanded by ExpandPostRAPseudos to a pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
+}
+
 let Constraints = "$src1 = $dst" in {
 // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
 // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
@@ -235,9 +244,12 @@
   let AddedComplexity = 15 in
   def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
             (MMX_MOVD64rr GR32:$src)>;
-  let AddedComplexity = 20 in
+  let AddedComplexity = 20 in {
+  def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
+            (MMX_SET0)>;
   def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
             (MMX_MOVD64rm addr:$src)>;
+  }
 }
 
 let mayStore = 1 in
Index: test/CodeGen/X86/mmx-fold-zero.ll
===================================================================
--- test/CodeGen/X86/mmx-fold-zero.ll
+++ test/CodeGen/X86/mmx-fold-zero.ll
@@ -8,15 +8,13 @@
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movq 8(%ebp), %mm0
 ; X86-NEXT:    movq 16(%ebp), %mm5
-; X86-NEXT:    movq %mm5, {{[0-9]+}}(%esp) # 8-byte Spill
+; X86-NEXT:    movq %mm5, (%esp) # 8-byte Spill
 ; X86-NEXT:    movq %mm0, %mm3
 ; X86-NEXT:    paddd %mm5, %mm3
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    movdq2q %xmm0, %mm1
-; X86-NEXT:    movq %mm1, (%esp) # 8-byte Spill
+; X86-NEXT:    pxor %mm1, %mm1
 ; X86-NEXT:    movq %mm3, %mm6
 ; X86-NEXT:    pmuludq %mm1, %mm6
 ; X86-NEXT:    movq 24(%ebp), %mm4
@@ -34,10 +32,10 @@
 ; X86-NEXT:    paddw %mm2, %mm0
 ; X86-NEXT:    paddw %mm6, %mm0
 ; X86-NEXT:    pmuludq %mm3, %mm0
-; X86-NEXT:    paddw (%esp), %mm0 # 8-byte Folded Reload
+; X86-NEXT:    paddw {{\.LCPI.*}}, %mm0
 ; X86-NEXT:    paddw %mm1, %mm0
 ; X86-NEXT:    pmuludq %mm7, %mm0
-; X86-NEXT:    pmuludq {{[0-9]+}}(%esp), %mm0 # 8-byte Folded Reload
+; X86-NEXT:    pmuludq (%esp), %mm0 # 8-byte Folded Reload
 ; X86-NEXT:    paddw %mm5, %mm0
 ; X86-NEXT:    paddw %mm2, %mm0
 ; X86-NEXT:    movq2dq %mm0, %xmm0
@@ -54,9 +52,7 @@
 ; X64-NEXT:    movq %mm5, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT:    movq %mm0, %mm3
 ; X64-NEXT:    paddd %mm5, %mm3
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    pxor %mm1, %mm1
 ; X64-NEXT:    movq %mm3, %mm6
 ; X64-NEXT:    pmuludq %mm1, %mm6
 ; X64-NEXT:    movdq2q %xmm2, %mm4
@@ -74,7 +70,7 @@
 ; X64-NEXT:    paddw %mm2, %mm0
 ; X64-NEXT:    paddw %mm6, %mm0
 ; X64-NEXT:    pmuludq %mm3, %mm0
-; X64-NEXT:    paddw -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload
+; X64-NEXT:    paddw {{\.LCPI.*}}, %mm0
 ; X64-NEXT:    paddw %mm1, %mm0
 ; X64-NEXT:    pmuludq %mm7, %mm0
 ; X64-NEXT:    pmuludq -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload
Index: test/CodeGen/X86/vector-shuffle-mmx.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-mmx.ll
+++ test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -33,26 +33,22 @@
 ; X32:       ## %bb.0: ## %entry
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    subl $16, %esp
-; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 16
 ; X32-NEXT:    .cfi_offset %edi, -8
-; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    movlps %xmm0, (%esp)
-; X32-NEXT:    movq (%esp), %mm0
+; X32-NEXT:    pxor %mm0, %mm0
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm1
+; X32-NEXT:    movsd %xmm0, (%esp)
+; X32-NEXT:    movq (%esp), %mm1
 ; X32-NEXT:    xorl %edi, %edi
 ; X32-NEXT:    maskmovq %mm1, %mm0
-; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    addl $8, %esp
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test1:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movlps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
+; X64-NEXT:    pxor %mm0, %mm0
 ; X64-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm1
@@ -99,16 +95,14 @@
 define <4 x float> @pr35869() nounwind {
 ; X32-LABEL: pr35869:
 ; X32:       ## %bb.0:
-; X32-NEXT:    subl $28, %esp
+; X32-NEXT:    subl $12, %esp
 ; X32-NEXT:    movl $64, %eax
 ; X32-NEXT:    movd %eax, %xmm0
 ; X32-NEXT:    movq %xmm0, (%esp)
-; X32-NEXT:    pxor %xmm0, %xmm0
-; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq (%esp), %mm0
-; X32-NEXT:    punpcklbw {{[0-9]+}}(%esp), %mm0 ## mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3]
-; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm1
+; X32-NEXT:    pxor %mm1, %mm1
+; X32-NEXT:    punpcklbw %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
+; X32-NEXT:    pxor %xmm0, %xmm0
 ; X32-NEXT:    pcmpgtw %mm0, %mm1
 ; X32-NEXT:    movq %mm0, %mm2
 ; X32-NEXT:    punpckhwd %mm1, %mm2 ## mm2 = mm2[2],mm1[2],mm2[3],mm1[3]
@@ -116,7 +110,7 @@
 ; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; X32-NEXT:    punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
 ; X32-NEXT:    cvtpi2ps %mm0, %xmm0
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    addl $12, %esp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pr35869:
@@ -124,12 +118,10 @@
 ; X64-NEXT:    movl $64, %eax
 ; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
-; X64-NEXT:    punpcklbw -{{[0-9]+}}(%rsp), %mm0 ## mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm1
+; X64-NEXT:    pxor %mm1, %mm1
+; X64-NEXT:    punpcklbw %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
+; X64-NEXT:    pxor %xmm0, %xmm0
 ; X64-NEXT:    pcmpgtw %mm0, %mm1
 ; X64-NEXT:    movq %mm0, %mm2
 ; X64-NEXT:    punpckhwd %mm1, %mm2 ## mm2 = mm2[2],mm1[2],mm2[3],mm1[3]
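
For reference, a minimal LLVM IR sketch of the shape the new combine targets (illustrative function name only, not taken from the test files above): an all-zeros 64-bit vector bitcast to x86_mmx, which the X86ISelLowering change rewrites to MMX_MOVW2D of zero, the new TableGen pattern selects as MMX_SET0, and post-RA expansion emits as a pxor.

; Illustrative only: a zero x86_mmx value built by bitcasting an all-zeros
; <2 x i32>. With this patch it should select to "pxor %mmN, %mmN" (or fold
; to a constant-pool load when that is profitable).
define x86_mmx @zero_mmx() {
entry:
  %z = bitcast <2 x i32> zeroinitializer to x86_mmx
  ret x86_mmx %z
}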