Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -409,7 +409,7 @@
     void getStoreMergeAndAliasCandidates(
         StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
         SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
-
+
     /// Merge consecutive store operations into a wide store.
     /// This optimization uses wide integers or vectors when possible.
     /// \return True if some memory operations were changed.
@@ -5567,7 +5567,7 @@
                      SDLoc(N));
 }
 
-/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or 
+/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
 /// a build_vector of constants.
 /// This function is called by the DAGCombiner when visiting sext/zext/aext
 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
@@ -8403,7 +8403,7 @@
   SDValue RV = BuildRsqrtEstimate(N->getOperand(0));
   if (!RV)
     return SDValue();
-
+
   EVT VT = RV.getValueType();
   SDLoc DL(N);
   RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV);
@@ -10850,12 +10850,12 @@
   // We need to make sure that these nodes do not interfere with
   // any of the store nodes.
   SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
-
+
   // Save the StoreSDNodes that we find in the chain.
   SmallVector<MemOpLink, 8> StoreNodes;
 
   getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
-
+
   // Check if there is anything to merge.
   if (StoreNodes.size() < 2)
     return false;
@@ -12985,32 +12985,73 @@
     RHS = RHS.getOperand(0);
 
   if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
-    SmallVector<int, 8> Indices;
+    EVT RVT = RHS.getValueType();
     unsigned NumElts = RHS.getNumOperands();
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue Elt = RHS.getOperand(i);
-      if (isAllOnesConstant(Elt))
-        Indices.push_back(i);
-      else if (isNullConstant(Elt))
-        Indices.push_back(NumElts+i);
-      else
+    // Attempt to create a valid clear mask, splitting the mask into
+    // sub elements and checking to see if each is
+    // all zeros or all ones - suitable for shuffle masking.
+    auto BuildClearMask = [&](unsigned Split) {
+      unsigned NumSubElts = NumElts * Split;
+      unsigned NumSubBits = RVT.getScalarSizeInBits() / Split;
+
+      SmallVector<int, 8> Indices;
+      for (unsigned i = 0; i != NumSubElts; ++i) {
+        unsigned EltIdx = i / Split;
+        unsigned SubIdx = i % Split;
+        SDValue Elt = RHS.getOperand(EltIdx);
+        if (Elt.getOpcode() == ISD::UNDEF) {
+          Indices.push_back(-1);
+          continue;
+        }
+
+        APInt Bits;
+        if (isa<ConstantSDNode>(Elt))
+          Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
+        else if (isa<ConstantFPSDNode>(Elt))
+          Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
+        else
+          return SDValue();
+
+        // Extract the sub element from the constant bit mask.
+        if (DAG.getDataLayout().isBigEndian()) {
+          Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits);
+        } else {
+          Bits = Bits.lshr(SubIdx * NumSubBits);
+        }
+
+        if (Split > 1)
+          Bits = Bits.trunc(NumSubBits);
+
+        if (Bits.isAllOnesValue())
+          Indices.push_back(i);
+        else if (Bits == 0)
+          Indices.push_back(i + NumSubElts);
+        else
+          return SDValue();
+      }
+
+      // Let's see if the target supports this vector_shuffle.
+      EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
+      EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
+      if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
         return SDValue();
-    }
-    // Let's see if the target supports this vector_shuffle.
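A minimal standalone sketch of the sub-element walk performed by the BuildClearMask lambda above, assuming little-endian lane order and plain uint64_t element masks (buildClearMask, EltMasks and the nullopt failure path are illustrative stand-ins, not code from this patch). The driver further down tries Split = 1 up to byte granularity, so a 64-bit scalar may be examined as 2 x 32-bit, 4 x 16-bit or 8 x 8-bit sub-elements.

#include <cstdint>
#include <optional>
#include <vector>

// For each constant AND-mask element, look at its Split sub-elements: an
// all-ones sub-element keeps the corresponding lane of the other operand
// (shuffle index i), an all-zero sub-element pulls a lane from the zero
// vector (shuffle index i + NumSubElts), anything else gives up.
std::optional<std::vector<int>>
buildClearMask(const std::vector<uint64_t> &EltMasks, unsigned EltBits,
               unsigned Split) {
  unsigned NumElts = EltMasks.size();
  unsigned NumSubElts = NumElts * Split;
  unsigned NumSubBits = EltBits / Split;
  uint64_t SubOnes = NumSubBits == 64 ? ~0ULL : ((1ULL << NumSubBits) - 1);

  std::vector<int> Indices;
  for (unsigned i = 0; i != NumSubElts; ++i) {
    uint64_t Bits =
        (EltMasks[i / Split] >> ((i % Split) * NumSubBits)) & SubOnes;
    if (Bits == SubOnes)
      Indices.push_back(i);              // keep this sub-element
    else if (Bits == 0)
      Indices.push_back(i + NumSubElts); // replace it with a zero lane
    else
      return std::nullopt;               // mixed bits: not a clear mask
  }
  return Indices;
}

// Example: a v2i64 AND with <0x00000000FFFFFFFF, 0xFFFFFFFF00000000> at
// Split = 2 (viewed as v4i32) yields the shuffle mask {0, 5, 6, 3}: lanes
// 0 and 3 are kept, lanes 1 and 2 are taken from the zero vector.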
- EVT RVT = RHS.getValueType(); - if (!TLI.isVectorClearMaskLegal(Indices, RVT)) - return SDValue(); + SDValue Zero = DAG.getConstant(0, dl, ClearVT); + return DAG.getBitcast( + VT, DAG.getVectorShuffle(ClearVT, dl, DAG.getBitcast(ClearVT, LHS), + Zero, &Indices[0])); + }; - // Return the new VECTOR_SHUFFLE node. - EVT EltVT = RVT.getVectorElementType(); - SmallVector ZeroOps(RVT.getVectorNumElements(), - DAG.getConstant(0, dl, EltVT)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, RVT, ZeroOps); - LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS); - SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]); - return DAG.getNode(ISD::BITCAST, dl, VT, Shuf); + // Determine maximum split level (byte level masking). + unsigned MaxSplit = 1; + if (RVT.getScalarSizeInBits() % 8 == 0) + MaxSplit = RVT.getScalarSizeInBits() / 8; + + for (unsigned Split = 1; Split <= MaxSplit; ++Split) + if (RVT.getScalarSizeInBits() % Split == 0) + if (SDValue S = BuildClearMask(Split)) + return S; } return SDValue(); @@ -13024,9 +13065,6 @@ SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if (SDValue Shuffle = XformToShuffleWithZero(N)) - return Shuffle; - // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold // this operation. if (LHS.getOpcode() == ISD::BUILD_VECTOR && @@ -13077,6 +13115,10 @@ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops); } + // Try to convert a constant mask AND into a shuffle clear mask. + if (SDValue Shuffle = XformToShuffleWithZero(N)) + return Shuffle; + // Type legalization might introduce new shuffles in the DAG. // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) // -> (shuffle (VBinOp (A, B)), Undef, Mask). Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -6502,136 +6502,6 @@ return DAG.getNode(ISD::OR, DL, VT, V1, V2); } -/// \brief Try to emit a blend instruction for a shuffle. -/// -/// This doesn't do any checks for the availability of instructions for blending -/// these values. It relies on the availability of the X86ISD::BLENDI pattern to -/// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is in fact a blend. -static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - unsigned BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] >= Size) { - if (Mask[i] != i + Size) - return SDValue(); // Shuffled V2 input! - BlendMask |= 1u << i; - continue; - } - if (Mask[i] >= 0 && Mask[i] != i) - return SDValue(); // Shuffled V1 input! - } - switch (VT.SimpleTy) { - case MVT::v2f64: - case MVT::v4f32: - case MVT::v4f64: - case MVT::v8f32: - return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); - - case MVT::v4i64: - case MVT::v8i32: - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); - // FALLTHROUGH - case MVT::v2i64: - case MVT::v4i32: - // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into - // that instruction. - if (Subtarget->hasAVX2()) { - // Scale the blend by the number of 32-bit dwords per element. 
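A worked example of the dword scaling described in the comment directly above (this blend lowering is only being relocated later in the file by the patch, not functionally changed). The helper below is a hypothetical standalone model, not the patch's code, of how a v4i64 blend mask is widened into a VPBLENDD immediate.

#include <cassert>

// Each lane taken from V2 sets Scale (= ScalarBits / 32) consecutive dword
// bits in the immediate.
static unsigned scaleBlendMaskToDwords(const int *Mask, int Size,
                                       int ScalarBits) {
  int Scale = ScalarBits / 32;
  unsigned BlendMask = 0;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size)                 // lane comes from V2
      for (int j = 0; j < Scale; ++j)
        BlendMask |= 1u << (i * Scale + j);
  return BlendMask;
}

int main() {
  // v4i64 shuffle mask <0, 5, 2, 7>: lanes 1 and 3 are taken from V2, so
  // dwords 2,3 and 6,7 are selected and the immediate is 0xCC.
  const int Mask[4] = {0, 5, 2, 7};
  assert(scaleBlendMaskToDwords(Mask, 4, 64) == 0xCC);
  return 0;
}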
- int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - - MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; - V1 = DAG.getBitcast(BlendVT, V1); - V2 = DAG.getBitcast(BlendVT, V2); - return DAG.getBitcast( - VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } - // FALLTHROUGH - case MVT::v8i16: { - // For integer shuffles we need to expand the mask and cast the inputs to - // v8i16s prior to blending. - int Scale = 8 / VT.getVectorNumElements(); - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - - V1 = DAG.getBitcast(MVT::v8i16, V1); - V2 = DAG.getBitcast(MVT::v8i16, V2); - return DAG.getBitcast(VT, - DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } - - case MVT::v16i16: { - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); - SmallVector RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { - // We can lower these with PBLENDW which is mirrored across 128-bit lanes. - assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); - BlendMask = 0; - for (int i = 0; i < 8; ++i) - if (RepeatedMask[i] >= 16) - BlendMask |= 1u << i; - return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); - } - } - // FALLTHROUGH - case MVT::v16i8: - case MVT::v32i8: { - assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && - "256-bit byte-blends require AVX2 support!"); - - // Scale the blend by the number of bytes per element. - int Scale = VT.getScalarSizeInBits() / 8; - - // This form of blend is always done on bytes. Compute the byte vector - // type. - MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); - - // Compute the VSELECT mask. Note that VSELECT is really confusing in the - // mix of LLVM's code generator and the x86 backend. We tell the code - // generator that boolean values in the elements of an x86 vector register - // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' - // mapping a select to operand #1, and 'false' mapping to operand #2. The - // reality in x86 is that vector masks (pre-AVX-512) use only the high bit - // of the element (the remaining are ignored) and 0 in that high bit would - // mean operand #1 while 1 in the high bit would mean operand #2. So while - // the LLVM model for boolean values in vector elements gets the relevant - // bit set, it is set backwards and over constrained relative to x86's - // actual model. - SmallVector VSELECTMask; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - for (int j = 0; j < Scale; ++j) - VSELECTMask.push_back( - Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) - : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, - MVT::i8)); - - V1 = DAG.getBitcast(BlendVT, V1); - V2 = DAG.getBitcast(BlendVT, V2); - return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getNode(ISD::BUILD_VECTOR, DL, - BlendVT, VSELECTMask), - V1, V2)); - } - - default: - llvm_unreachable("Not a supported integer vector type!"); - } -} - /// \brief Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. 
/// @@ -6920,6 +6790,139 @@ return V; } +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is in fact a blend. +static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + unsigned BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] >= Size) { + if (Mask[i] != i + Size) + return SDValue(); // Shuffled V2 input! + BlendMask |= 1u << i; + continue; + } + if (Mask[i] >= 0 && Mask[i] != i) + return SDValue(); // Shuffled V1 input! + } + switch (VT.SimpleTy) { + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4f64: + case MVT::v8f32: + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); + + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: + case MVT::v4i32: + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. + if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { + // For integer shuffles we need to expand the mask and cast the inputs to + // v8i16s prior to blending. + int Scale = 8 / VT.getVectorNumElements(); + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); + } + + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v16i8: + case MVT::v32i8: { + assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && + "256-bit byte-blends require AVX2 support!"); + + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + + // This form of blend is always done on bytes. Compute the byte vector + // type. 
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SmallVector VSELECTMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask.push_back( + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, + MVT::i8)); + + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, + DAG.getNode(ISD::BUILD_VECTOR, DL, + BlendVT, VSELECTMask), + V1, V2)); + } + + default: + llvm_unreachable("Not a supported integer vector type!"); + } +} + /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -9087,6 +9090,10 @@ return V; } + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {// Low half. 0, 16, 1, 17, 2, 18, 3, 19, @@ -15404,7 +15411,7 @@ SDValue Mask = Op.getOperand(3); SDValue RoundingMode; // We allways add rounding mode to the Node. - // If the rounding mode is not specified, we add the + // If the rounding mode is not specified, we add the // "current direction" mode. 
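The functional addition in the X86 hunks above is the early lowerVectorShuffleAsBitMask call in the byte-blend paths. Below is a rough standalone model of the idea, under the assumption that the second input's lanes are known zeros; the helper name, the ZeroLanes parameter and the 0x00/0xFF encoding are illustrative and are not the real helper's interface. The point is that when every output lane either keeps lane i of V1 or is known to be zero, the blend degenerates to an AND with a constant all-ones/zero mask instead of a variable byte blend.

#include <cstdint>
#include <optional>
#include <vector>

// Mask is the shuffle mask; ZeroLanes[j] says whether lane j of V2 is known
// to be zero. Returns the per-lane AND mask if the whole blend can be done
// as V1 & mask, or nullopt if real data movement from V2 is required.
std::optional<std::vector<uint8_t>>
blendAsBitMask(const std::vector<int> &Mask,
               const std::vector<bool> &ZeroLanes) {
  int Size = Mask.size();
  std::vector<uint8_t> AndMask(Size);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0 || Mask[i] == i)
      AndMask[i] = 0xFF;                 // undef, or V1 lane kept in place
    else if (Mask[i] >= Size && ZeroLanes[Mask[i] - Size])
      AndMask[i] = 0x00;                 // lane is simply cleared
    else
      return std::nullopt;               // needs a real blend/shuffle
  }
  return AndMask;                        // apply as V1 & AndMask (PAND)
}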
if (Op.getNumOperands() == 4) RoundingMode = Index: test/CodeGen/X86/avx2-conversions.ll =================================================================== --- test/CodeGen/X86/avx2-conversions.ll +++ test/CodeGen/X86/avx2-conversions.ll @@ -62,8 +62,7 @@ ; CHECK-LABEL: zext_8i8_8i32: ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq %B = zext <8 x i8> %A to <8 x i32> ret <8 x i32>%B Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -192,8 +192,8 @@ ; SKX-LABEL: test15: ; SKX: ## BB#0: -; SKX-NEXT: vpandq {{.*}}(%rip), %xmm0, %xmm0 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 ; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq Index: test/CodeGen/X86/sse2-vector-shifts.ll =================================================================== --- test/CodeGen/X86/sse2-vector-shifts.ll +++ test/CodeGen/X86/sse2-vector-shifts.ll @@ -313,6 +313,7 @@ ; CHECK-LABEL: @shl_zext_srl_v4i32 ; CHECK: andps +; CHECK: andps ; CHECK-NEXT: ret define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind { %srl = lshr <4 x i16> %x, Index: test/CodeGen/X86/sse3.ll =================================================================== --- test/CodeGen/X86/sse3.ll +++ test/CodeGen/X86/sse3.ll @@ -269,8 +269,10 @@ define <4 x i32> @t17() nounwind { ; X64-LABEL: t17: ; X64: ## BB#0: ## %entry -; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; X64-NEXT: andpd {{.*}}(%rip), %xmm0 +; X64-NEXT: movaps (%rax), %xmm0 +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: retq entry: %tmp1 = load <4 x float>, <4 x float>* undef, align 16 Index: test/CodeGen/X86/vec_cast2.ll =================================================================== --- test/CodeGen/X86/vec_cast2.ll +++ test/CodeGen/X86/vec_cast2.ll @@ -46,10 +46,12 @@ define <8 x float> @foo2_8(<8 x i8> %src) { ; CHECK-LABEL: foo2_8: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: vandps LCPI2_0, %ymm0, %ymm0 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl ; Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -455,18 +455,14 @@ define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) { ; SSE-LABEL: uitofp_2i16_to_2f64: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE-NEXT: pand 
{{.*}}(%rip), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: uitofp_2i16_to_2f64: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> @@ -503,19 +499,15 @@ define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) { ; SSE-LABEL: uitofp_2i8_to_2f64: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: uitofp_2i8_to_2f64: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> @@ -632,31 +624,31 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) { ; SSE-LABEL: uitofp_4i32_to_4f64: ; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] -; SSE-NEXT: subpd %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] -; SSE-NEXT: addpd %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: subpd %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] -; SSE-NEXT: addpd %xmm1, %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25] +; SSE-NEXT: subpd %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: subpd %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1] +; SSE-NEXT: addpd %xmm4, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = 
xmm2[2,3,0,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: subpd %xmm4, %xmm2 +; SSE-NEXT: subpd %xmm5, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] ; SSE-NEXT: addpd %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: subpd %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] -; SSE-NEXT: addpd %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: subpd %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; SSE-NEXT: addpd %xmm4, %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: retq ; @@ -1725,8 +1717,7 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1767,18 +1758,19 @@ ; AVX1-LABEL: uitofp_8i8_to_8f32: ; AVX1: # BB#0: ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_8i8_to_8f32: ; AVX2: # BB#0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX2-NEXT: retq %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -39,8 +39,7 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_16i8_to_16i16: @@ -48,16 +47,15 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_16i8_to_16i16: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_16i8_to_16i16: @@ -125,34 +123,31 @@ ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_16i8_to_8i32: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_16i8_to_8i32: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_8i32: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq 
entry: %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -163,28 +158,25 @@ define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_16i8_to_2i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_16i8_to_2i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_16i8_to_2i64: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: zext_16i8_to_2i64: ; AVX: # BB#0: # %entry ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> @@ -195,55 +187,48 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_16i8_to_4i64: ; SSE2: # BB#0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255] -; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_16i8_to_4i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255] -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,1,1,2,2,3,3,3,3,5,5,2,2,3,3] -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_16i8_to_4i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255] -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,1,1,2,2,3,3,3,3,5,5,2,2,3,3] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_16i8_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_4i64: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq entry: %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> @@ -285,8 +270,7 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_8i16_to_8i32: @@ -294,16 +278,15 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_8i16_to_8i32: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_8i16_to_8i32: @@ 
-326,28 +309,26 @@ define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_8i16_to_2i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_8i16_to_2i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_8i16_to_2i64: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: zext_8i16_to_2i64: ; AVX: # BB#0: # %entry ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> @@ -358,52 +339,49 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_8i16_to_4i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_8i16_to_4i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5],zero,zero,zero,zero,zero,zero,xmm1[6,7],zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_8i16_to_4i64: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535] -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7] -; 
SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_8i16_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i16_to_4i64: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX2-NEXT: retq entry: %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> @@ -414,26 +392,24 @@ define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_4i32_to_2i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_4i32_to_2i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_4i32_to_2i64: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: zext_4i32_to_2i64: ; AVX: # BB#0: # %entry ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> @@ -444,32 +420,26 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_4i32_to_4i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_4i32_to_4i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; 
SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_4i32_to_4i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_4i32_to_4i64: @@ -561,7 +531,7 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -572,11 +542,9 @@ ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,zero,zero,zero,zero,xmm1[12],zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_4i8_to_4i64: @@ -639,7 +607,7 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -650,11 +618,8 @@ ; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[6],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = 
xmm1[8],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[14],zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_8i8_to_8i32: @@ -687,8 +652,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_zext_16i8_to_16i16: @@ -697,8 +661,7 @@ ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_16i8_to_16i16: @@ -792,7 +755,7 @@ ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -802,11 +765,9 @@ ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_4i16_to_4i64: @@ -839,8 +800,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_zext_8i16_to_8i32: @@ -849,8 +809,7 @@ ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_8i16_to_8i32: @@ -910,21 +869,19 @@ ; SSE2-LABEL: 
load_zext_4i32_to_4i64: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_zext_4i32_to_4i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_4i32_to_4i64: @@ -955,7 +912,7 @@ ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pand %xmm0, %xmm1 @@ -966,7 +923,7 @@ ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pand %xmm0, %xmm1 @@ -976,7 +933,7 @@ ; SSE41-LABEL: zext_8i8_to_8i32: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pand %xmm0, %xmm1 @@ -985,17 +942,18 @@ ; ; AVX1-LABEL: zext_8i8_to_8i32: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i8_to_8i32: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq entry: %t = zext <8 x 
i8> %z to <8 x i32> @@ -1092,17 +1050,16 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) { ; SSE2-LABEL: shuf_zext_8i8_to_8i32: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuf_zext_8i8_to_8i32: Index: test/CodeGen/X86/vselect-avx.ll =================================================================== --- test/CodeGen/X86/vselect-avx.ll +++ test/CodeGen/X86/vselect-avx.ll @@ -62,13 +62,15 @@ ; CHECK-LABEL: test3: ; Compute the mask. -; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]] +; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]] ; Do not shrink the bit of the mask. -; CHECK-NOT: vpslld $31, [[MASK]], {{%xmm[0-9]+}} +; CHECK-NOT: vpslld $31, [[MASK]], {{%xmm[0-9]+}} ; Use the mask in the blend. -; CHECK-NEXT: vblendvps [[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}} -; Use the mask in the and. -; CHECK-NEXT: vpand LCPI2_2(%rip), [[MASK]], {{%xmm[0-9]+}} +; CHECK-NEXT: vblendvps [[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}} +; Shuffle mask to truncate. +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}} +; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}} ; CHECK: retq define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) { %tmp6 = srem <4 x i32> %induction30, Index: test/CodeGen/X86/widen_load-2.ll =================================================================== --- test/CodeGen/X86/widen_load-2.ll +++ test/CodeGen/X86/widen_load-2.ll @@ -191,9 +191,7 @@ ; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x ; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]]) ; CHECK-NEXT: movb $1, 2(%[[PTR1]]) -; CHECK-NEXT: movl (%[[PTR0]]), [[TMP1:%e[abcd]+x]] -; CHECK-NEXT: movl [[TMP1]], [[TMP2:.*]] -; CHECK-NEXT: pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]] +; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]] ; CHECK-NEXT: movdqa %[[X0]], %[[X1:xmm[0-9]+]] ; CHECK-NEXT: psrld $1, %[[X1]] ; CHECK-NEXT: pblendw $192, %[[X0]], %[[X1]]
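Taken together, the test updates above mostly trade a pand with a splat constant for unpacks, blends or shuffles against an explicit zero register. A self-contained model of the lane-level equivalence the DAGCombiner change relies on, with plain uint32_t lanes standing in for a v4i32 value (illustrative code, not from the patch):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint32_t, 4> X = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
  std::array<uint32_t, 4> MaskConst = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};
  std::array<uint32_t, 4> Zero = {0, 0, 0, 0};

  // The AND form the IR starts from: x & <-1, 0, -1, 0>.
  std::array<uint32_t, 4> AndRes;
  for (int i = 0; i < 4; ++i)
    AndRes[i] = X[i] & MaskConst[i];

  // The shuffle-with-zero form the combine produces: indices < 4 read from X,
  // indices >= 4 read from the zero vector, i.e. shuffle(x, zero, <0,5,2,7>).
  std::array<int, 4> ShufMask = {0, 5, 2, 7};
  std::array<uint32_t, 4> ShufRes;
  for (int i = 0; i < 4; ++i)
    ShufRes[i] = ShufMask[i] < 4 ? X[ShufMask[i]] : Zero[ShufMask[i] - 4];

  assert(AndRes == ShufRes); // the two lowerings agree lane by lane
  return 0;
}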