Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -16650,7 +16650,7 @@
 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> Mask = SVOp->getMask();
+  ArrayRef<int> OrigMask = SVOp->getMask();
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   MVT VT = Op.getSimpleValueType();
@@ -16676,8 +16676,8 @@
   // undef as well. This makes it easier to match the shuffle based solely on
   // the mask.
   if (V2IsUndef &&
-      any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
-    SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
+    SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
     for (int &M : NewMask)
       if (M >= NumElements)
         M = -1;
@@ -16685,15 +16685,16 @@
   }

   // Check for illegal shuffle mask element index values.
-  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
-  assert(llvm::all_of(Mask,
+  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
+  (void)MaskUpperLimit;
+  assert(llvm::all_of(OrigMask,
                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
          "Out of bounds shuffle index");

   // We actually see shuffles that are entirely re-arrangements of a set of
   // zero inputs. This mostly happens while decomposing complex shuffles into
   // simple ones. Directly lower these as a buildvector of zeros.
-  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
   if (Zeroable.isAllOnesValue())
     return getZeroVector(VT, Subtarget, DAG, DL);

@@ -16701,11 +16702,11 @@

   // Create an alternative mask with info about zeroable elements.
   // Here we do not set undef elements as zeroable.
-  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+  SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
   if (V2IsZero) {
     assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
     for (int i = 0; i != NumElements; ++i)
-      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+      if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
         ZeroableMask[i] = SM_SentinelZero;
   }

@@ -16720,7 +16721,7 @@
   // by obfuscating the operands with bitcasts.
   // TODO: Avoid lowering directly from this top-level function: make this
   // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
                                                   Subtarget, DAG))
     return Broadcast;

@@ -16756,8 +16757,11 @@
   }

   // Commute the shuffle if it will improve canonicalization.
-  if (canonicalizeShuffleMaskWithCommute(Mask))
-    return DAG.getCommutedVectorShuffle(*SVOp);
+  SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
+  if (canonicalizeShuffleMaskWithCommute(Mask)) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  }

   if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
     return V;
Index: llvm/trunk/test/CodeGen/X86/oddshuffles.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/oddshuffles.ll
+++ llvm/trunk/test/CodeGen/X86/oddshuffles.ll
@@ -1056,7 +1056,7 @@
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT: vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT: vmovdqu %xmm4, (%rdx)
@@ -1094,8 +1094,8 @@
 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm2[4,5,10,11]
 ; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm2[0,1,6,7,12,13]
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11],xmm1[0,1,6,7,12,13,14,15,0,1,2,3]
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9],xmm2[2,3,8,9,14,15]
+; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13],xmm2[2,3,8,9,14,15]
 ; XOP-NEXT: vmovdqu %xmm3, (%rsi)
 ; XOP-NEXT: vmovdqu %xmm4, (%rdx)
 ; XOP-NEXT: vmovdqu %xmm0, (%rcx)
@@ -1187,7 +1187,7 @@
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7]
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1233,9 +1233,8 @@
 ; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11]
 ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[10,11],xmm0[12,13,12,13],xmm1[12,13,12,13],xmm0[14,15],xmm1[14,15],xmm0[14,15]
-; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5],xmm2[10,11],xmm0[10,11,8,9],xmm2[12,13],xmm0[14,15,12,13],xmm2[14,15]
 ; XOP-NEXT: vmovdqu %xmm0, 32(%rdi)
 ; XOP-NEXT: vmovups %ymm3, (%rdi)
 ; XOP-NEXT: vzeroupper
Index: llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1323,8 +1323,8 @@
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpsllq $48, %xmm0, %xmm0
 ; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
@@ -1334,8 +1334,8 @@
 ; XOPAVX2: # %bb.0:
 ; XOPAVX2-NEXT: vpsllq $48, %xmm0, %xmm0
 ; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -718,20 +718,23 @@
 define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 ; SSE2-LABEL: shuffle_v2i64_z1:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: xorpd %xmm1, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_z1:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: xorpd %xmm1, %xmm1
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT: movaps %xmm1, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_z1:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: xorpd %xmm1, %xmm1
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_z1:
Index: llvm/trunk/test/CodeGen/X86/vector-zext-widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext-widen.ll
+++ llvm/trunk/test/CodeGen/X86/vector-zext-widen.ll
@@ -1999,19 +1999,16 @@
 ;
 ; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
 ; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
Index: llvm/trunk/test/CodeGen/X86/vector-zext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext.ll
+++ llvm/trunk/test/CodeGen/X86/vector-zext.ll
@@ -2057,19 +2057,16 @@
 ;
 ; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
 ; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
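
Note on the X86ISelLowering.cpp change: rather than returning DAG.getCommutedVectorShuffle(*SVOp) and re-entering lowering with a new node, the patch copies the original mask into a local SmallVector, commutes that copy in place with ShuffleVectorSDNode::commuteMask, and swaps V1/V2, so the rest of lowerVectorShuffle keeps working on the (possibly commuted) local Mask. Below is a minimal standalone C++ sketch of the mask-commute semantics only; the commuteMask free function and the driver are illustrative stand-ins, not LLVM's implementation.

#include <cstdio>
#include <utility>
#include <vector>

// Illustrative helper mirroring the semantics of
// ShuffleVectorSDNode::commuteMask(): every in-range element that picked a
// lane of operand 0 now picks the same lane of operand 1 and vice versa;
// negative sentinel elements (undef/zero) are left untouched.
static void commuteMask(std::vector<int> &Mask) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int &M : Mask) {
    if (M < 0)
      continue;                      // sentinels stay as-is
    M = (M < NumElts) ? M + NumElts  // was operand 0 -> now operand 1
                      : M - NumElts; // was operand 1 -> now operand 0
  }
}

int main() {
  // Shuffle of two 4-element vectors A and B: indices 0-3 pick from A,
  // indices 4-7 pick from B, -1 marks an undef element.
  std::vector<int> Mask = {0, 5, -1, 7};
  std::vector<const char *> Ops = {"A", "B"};

  // Commute the mask in place and swap the operands, as the patched
  // lowering does, instead of building a brand-new commuted shuffle.
  commuteMask(Mask);
  std::swap(Ops[0], Ops[1]);

  for (int M : Mask)
    std::printf("%d ", M); // prints: 4 1 -1 3
  std::printf("(V1=%s, V2=%s)\n", Ops[0], Ops[1]);
  return 0;
}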