Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7982,6 +7982,28 @@
       return V;
   }
 
+  // General extends failed, but 128-bit vectors may be able to use MOVQ.
+  if (Bits != 128)
+    return SDValue();
+
+  auto MatchMOVQ = [&]() -> SDValue {
+    // MOVQ copies the lower 64 bits of a vector and zeros the upper 64 bits.
+    for (int i = NumElements / 2; i != NumElements; i++)
+      if (!Zeroable[i])
+        return SDValue();
+    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+      return V1;
+    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+      return V2;
+    return SDValue();
+  };
+
+  if (SDValue MOVQ = MatchMOVQ()) {
+    MOVQ = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, MOVQ);
+    MOVQ = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, MOVQ);
+    return DAG.getNode(ISD::BITCAST, DL, VT, MOVQ);
+  }
+
   // No viable ext lowering found.
   return SDValue();
 }
@@ -13199,14 +13221,14 @@
                                      SelectionDAG &DAG) {
   if (!Subtarget->hasAVX())
     return SDValue();
-  
+
   SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
   SDValue SubVec = Op.getOperand(1);
   SDValue Idx = Op.getOperand(2);
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
-  
+
   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
       SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
@@ -24799,7 +24821,7 @@
       NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
     }
-    
+
     SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                        Mld->getBasePtr(), NewMask, WideSrc0,
                                        Mld->getMemoryVT(), Mld->getMemOperand(),
                                        ISD::NON_EXTLOAD);
@@ -24829,7 +24851,7 @@
          "Unexpected size for truncating masked store");
   // We are going to use the original vector elt for storing.
   // Accumulated smaller vector elements must be a multiple of the store size.
-  assert (((NumElems * FromSz) % ToSz) == 0 && 
+  assert (((NumElems * FromSz) % ToSz) == 0 &&
          "Unexpected ratio for truncating masked store");
   unsigned SizeRatio = FromSz / ToSz;
Index: test/CodeGen/X86/combine-or.ll
===================================================================
--- test/CodeGen/X86/combine-or.ll
+++ test/CodeGen/X86/combine-or.ll
@@ -207,11 +207,10 @@
 ; CHECK-LABEL: test17:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    xorps %xmm2, %xmm2
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0]
 ; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
 ; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; CHECK-NEXT:    orps %xmm1, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    movq %xmm1, %xmm0
+; CHECK-NEXT:    orps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
Index: test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -460,27 +460,25 @@
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    movq %xmm0, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR20540:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    movq %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: PR20540:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    movq %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: PR20540:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovq %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
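
For reference, here is a standalone sketch of the predicate that MatchMOVQ applies, modeled for a v4i32 shuffle (NumElements == 4): the upper half of the mask must be zeroable, and the lower half must read sequentially from the start of either V1 or V2. This is illustrative only and not part of the patch; the helper names below and the simplified Zeroable array are assumptions, not LLVM APIs. It exercises the two masks from test17 above.

// Illustrative sketch, not LLVM code. Mask elements follow LLVM's two-input
// shuffle convention for v4i32: 0-3 select from V1, 4-7 from V2, -1 is undef.
#include <array>
#include <cstdio>

static bool isSequentialOrUndef(const std::array<int, 4> &Mask, int Pos,
                                int Size, int Low) {
  // Each element of Mask[Pos, Pos+Size) must be undef or exactly Low+offset.
  for (int i = Pos; i != Pos + Size; ++i, ++Low)
    if (Mask[i] != -1 && Mask[i] != Low)
      return false;
  return true;
}

// Returns 1 if the shuffle is a MOVQ of V1, 2 if of V2, 0 if MOVQ can't match.
static int matchMOVQ(const std::array<int, 4> &Mask,
                     const std::array<bool, 4> &Zeroable) {
  // MOVQ zeros the upper 64 bits, so every upper mask element must be zeroable.
  for (int i = 2; i != 4; ++i)
    if (!Zeroable[i])
      return 0;
  if (isSequentialOrUndef(Mask, 0, 2, 0)) // Lower half is V1[0], V1[1].
    return 1;
  if (isSequentialOrUndef(Mask, 0, 2, 4)) // Lower half is V2[0], V2[1].
    return 2;
  return 0;
}

int main() {
  // %shuf2 from test17: <0, 1, 4, 4> against a zero operand -> MOVQ of V1.
  std::printf("%d\n", matchMOVQ({{0, 1, 4, 4}}, {{false, false, true, true}}));
  // %shuf1 from test17: <4, 0, 4, 2> keeps a nonzero upper element -> no MOVQ.
  std::printf("%d\n", matchMOVQ({{4, 0, 4, 2}}, {{true, false, true, false}}));
}

This matches the codegen change in combine-or.ll: %shuf2 collapses to a single movq, while %shuf1 still needs the shufps sequence because its upper half is not entirely zero.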