Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -782,6 +782,7 @@
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
@@ -13948,15 +13949,21 @@
   return SDValue();
 }
 
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
   SDLoc dl(Op);
   MVT OpVT = Op.getSimpleValueType();
 
+  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
+  // further combines.
+  if (X86::isZeroNode(Op.getOperand(0)))
+    return getZeroVector(OpVT, Subtarget, DAG, dl);
+
   // If this is a 256-bit vector result, first insert into a 128-bit
   // vector and then insert into the 256-bit vector.
   if (!OpVT.is128BitVector()) {
     // Insert into a 128-bit vector.
-    unsigned SizeFactor = OpVT.getSizeInBits()/128;
+    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                  OpVT.getVectorNumElements() / SizeFactor);
 
@@ -13965,9 +13972,13 @@
     // Insert the 128-bit vector.
     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
   }
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
+
+  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
+  if (OpVT == MVT::v4i32)
+    return Op;
 
   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
-  assert(OpVT.is128BitVector() && "Expected an SSE type!");
   return DAG.getBitcast(
       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
 }
@@ -23317,7 +23328,7 @@
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
-  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
Index: test/CodeGen/X86/clear_upper_vector_element_bits.ll
===================================================================
--- test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -301,9 +301,8 @@
 define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
 ; SSE-LABEL: _clearupper2xi64b:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    xorps %xmm2, %xmm2
+; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
@@ -356,44 +355,7 @@
 define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
 ; SSE-LABEL: _clearupper8xi16b:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    psllw $8, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    pslld $24, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    psllq $40, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    psllq $56, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper8xi16b:
Index: test/CodeGen/X86/insertelement-zero.ll
===================================================================
--- test/CodeGen/X86/insertelement-zero.ll
+++ test/CodeGen/X86/insertelement-zero.ll
@@ -244,24 +244,21 @@
 define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
 ; SSE2-LABEL: insert_v4i32_01z3:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v4i32_01z3:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorl %eax, %eax
-; SSE3-NEXT:    movd %eax, %xmm1
+; SSE3-NEXT:    xorps %xmm1, %xmm1
 ; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 ; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_v4i32_01z3:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorl %eax, %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSSE3-NEXT:    retq
@@ -292,8 +289,7 @@
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    xorps %xmm2, %xmm2
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 ; SSE2-NEXT:    retq
@@ -302,8 +298,7 @@
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm2, %xmm2
 ; SSE3-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE3-NEXT:    xorl %eax, %eax
-; SSE3-NEXT:    movd %eax, %xmm2
+; SSE3-NEXT:    xorps %xmm2, %xmm2
 ; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
 ; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 ; SSE3-NEXT:    retq
@@ -312,8 +307,7 @@
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm2, %xmm2
 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSSE3-NEXT:    xorl %eax, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
+; SSSE3-NEXT:    xorps %xmm2, %xmm2
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 ; SSSE3-NEXT:    retq
@@ -443,24 +437,12 @@
 define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
 ; SSE2-LABEL: insert_v16i8_z123456789ABCDEz:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v16i8_z123456789ABCDEz:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    xorl %eax, %eax
-; SSE3-NEXT:    movd %eax, %xmm2
-; SSE3-NEXT:    pandn %xmm2, %xmm1
-; SSE3-NEXT:    por %xmm1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz:
@@ -489,25 +471,13 @@
 define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; SSE2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    pand %xmm2, %xmm0
-; SSE3-NEXT:    xorl %eax, %eax
-; SSE3-NEXT:    movd %eax, %xmm3
-; SSE3-NEXT:    pandn %xmm3, %xmm2
-; SSE3-NEXT:    por %xmm2, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    andps {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    retq
 ;
Index: test/CodeGen/X86/vector-shuffle-variable-128.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -1329,28 +1329,27 @@
 ; SSE2-NEXT:    andl $7, %ecx
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    andl $7, %r8d
-; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    andl $7, %r9d
 ; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
-; SSE2-NEXT:    xorl %esi, %esi
-; SSE2-NEXT:    movd %esi, %xmm0
 ; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
-; SSE2-NEXT:    movd %ecx, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; SSE2-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
 ; SSE2-NEXT:    movzwl -40(%rsp,%rdx,2), %ecx
-; SSE2-NEXT:    movd %ecx, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    movzwl -40(%rsp,%r8,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE2-NEXT:    retq
 ;
@@ -1368,28 +1367,27 @@
 ; SSSE3-NEXT:    andl $7, %ecx
 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSSE3-NEXT:    andl $7, %r8d
-; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; SSSE3-NEXT:    andl $7, %r9d
 ; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT:    xorl %esi, %esi
-; SSSE3-NEXT:    movd %esi, %xmm0
 ; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSSE3-NEXT:    movd %eax, %xmm2
 ; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm3
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; SSSE3-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
 ; SSSE3-NEXT:    movzwl -40(%rsp,%rdx,2), %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    movzwl -40(%rsp,%r8,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT:    movd %eax, %xmm1
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSSE3-NEXT:    retq
 ;
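
For reference, a minimal standalone reproducer in the style of the updated tests; the function name and RUN line below are illustrative, not taken from the patch. Inserting a zero element used to materialize the zero in a GPR and transfer it to a vector register (xorl %eax, %eax / movd %eax, %xmm1); with this change it lowers directly to an xorps, matching the insert_v4i32_01z3 checks above.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

; Insert zero into element 2 of %a, i.e. the insert_v4i32_01z3 pattern.
define <4 x i32> @insert_zero_elt_2(<4 x i32> %a) {
; CHECK-LABEL: insert_zero_elt_2:
; CHECK-NOT:   movd
; CHECK:       xorps %xmm1, %xmm1
  %v = insertelement <4 x i32> %a, i32 0, i32 2
  ret <4 x i32> %v
}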