diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -969,6 +969,7 @@
     setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
     setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
@@ -1175,10 +1176,6 @@
       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
     }
 
-    // i8 vectors are custom because the source register and source
-    // source memory operand types are not the same width.
-    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
-
     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
       // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can
       // do the pre and post work in the vector domain.
@@ -19310,17 +19307,28 @@
   bool IsZeroElt = X86::isZeroNode(N1);
   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
 
-  // If we are inserting a element, see if we can do this more efficiently with
-  // a blend shuffle with a rematerializable vector than a costly integer
-  // insertion.
-  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
-      (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
-    SmallVector<int, 8> BlendMask;
-    for (unsigned i = 0; i != NumElts; ++i)
-      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
-    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
-                                  : getOnesVector(VT, DAG, dl);
-    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
+  if (IsZeroElt || IsAllOnesElt) {
+    // Lower insertion of i8 -1 as an 'OR' blend.
+    // We don't deal with i8 0 since it appears to be handled elsewhere.
+    if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
+      SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
+      SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
+      SmallVector<SDValue> CstVectorElts(NumElts, ZeroCst);
+      CstVectorElts[IdxVal] = OnesCst;
+      SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
+      return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
+    }
+    // See if we can do this more efficiently with a blend shuffle with a
+    // rematerializable vector.
+    if (Subtarget.hasSSE41() &&
+        (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
+      SmallVector<int, 8> BlendMask;
+      for (unsigned i = 0; i != NumElts; ++i)
+        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+      SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+                                    : getOnesVector(VT, DAG, dl);
+      return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
+    }
   }
 
   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
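Note (reviewer illustration, not part of the patch): the new pre-SSE4.1 path above relies on the identity that inserting an all-ones i8 element at lane IdxVal is the same as OR-ing the source vector with a constant that is zero everywhere except 0xFF at that lane, which is why the SSE2/SSE3/SSSE3 checks in the test diff below collapse to plain 'orps' against constant-pool operands. A minimal standalone C++ sketch of that identity follows; the helper names (insertAllOnes, orBlendAllOnes) are hypothetical and nothing here uses LLVM APIs.

// Reference semantics of: insertelement <16 x i8> %v, i8 -1, i32 Idx
#include <array>
#include <cassert>
#include <cstdint>

using V16 = std::array<uint8_t, 16>;

static V16 insertAllOnes(V16 V, unsigned Idx) {
  V[Idx] = 0xFF; // scalar insertion of the all-ones byte
  return V;
}

// The OR-blend form the new lowering emits: build a constant vector that is
// zero except for an all-ones byte at Idx, then OR it into the source.
static V16 orBlendAllOnes(V16 V, unsigned Idx) {
  V16 Mask{};      // all zeros
  Mask[Idx] = 0xFF; // single all-ones lane
  for (unsigned i = 0; i != 16; ++i)
    V[i] |= Mask[i]; // lane-wise OR, i.e. por/orps
  return V;
}

int main() {
  V16 A{};
  for (unsigned i = 0; i != 16; ++i)
    A[i] = uint8_t(3 * i + 1);
  // The two forms agree for every lane.
  for (unsigned Idx = 0; Idx != 16; ++Idx)
    assert(insertAllOnes(A, Idx) == orBlendAllOnes(A, Idx));
  return 0;
}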
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -311,36 +311,20 @@
 define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
 ; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    movl $255, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm1
-; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    movl $255, %eax
-; SSE3-NEXT:    movd %eax, %xmm2
-; SSE3-NEXT:    pandn %xmm2, %xmm1
-; SSE3-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE3-NEXT:    por %xmm1, %xmm2
-; SSE3-NEXT:    por %xmm2, %xmm0
+; SSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movl $255, %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_v16i8_x123456789ABCDEx:
@@ -364,48 +348,29 @@
 define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
 ; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    movl $255, %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255]
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    orps %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT:    movl $255, %eax
-; SSE3-NEXT:    movd %eax, %xmm3
-; SSE3-NEXT:    pandn %xmm3, %xmm2
-; SSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSE3-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE3-NEXT:    por %xmm4, %xmm2
-; SSE3-NEXT:    por %xmm2, %xmm0
-; SSE3-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE3-NEXT:    por %xmm4, %xmm3
-; SSE3-NEXT:    por %xmm3, %xmm1
+; SSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255]
+; SSE3-NEXT:    orps %xmm2, %xmm0
+; SSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    orps %xmm2, %xmm1
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movl $255, %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSSE3-NEXT:    por %xmm0, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    por %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255]
+; SSSE3-NEXT:    orps %xmm2, %xmm0
+; SSSE3-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    orps %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: