Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -13773,7 +13773,33 @@ if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; - // TODO: handle v16i8. + // TODO: We only extract a single element from v16i8, we can probably afford + // to be more aggressive here before using the default approach of spilling to + // stack. + if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { + // Extract either the lowest i32 or any i16, and extract the sub-byte. + int DWordIdx = IdxVal / 4; + if (DWordIdx == 0) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), + DAG.getIntPtrConstant(DWordIdx, dl)); + int ShiftVal = (IdxVal % 4) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, + DAG.getConstant(ShiftVal, dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + int WordIdx = IdxVal / 2; + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + DAG.getBitcast(MVT::v8i16, Vec), + DAG.getIntPtrConstant(WordIdx, dl)); + int ShiftVal = (IdxVal % 2) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, + DAG.getConstant(ShiftVal, dl, MVT::i16)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) Index: test/CodeGen/X86/extract-store.ll =================================================================== --- test/CodeGen/X86/extract-store.ll +++ test/CodeGen/X86/extract-store.ll @@ -6,8 +6,7 @@ define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) { ; SSE2-LABEL: extract_i8_0: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movb %al, (%rdi) ; SSE2-NEXT: retq ; @@ -28,9 +27,8 @@ define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) { ; SSE2-LABEL: extract_i8_15: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb %al, (%rdi) +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: movb %ah, (%rdi) # NOREX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract_i8_15: Index: test/CodeGen/X86/extractelement-index.ll =================================================================== --- test/CodeGen/X86/extractelement-index.ll +++ test/CodeGen/X86/extractelement-index.ll @@ -11,8 +11,9 @@ define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v16i8_1: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v16i8_1: @@ -33,8 +34,9 @@ define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v16i8_11: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v16i8_11: @@ -55,8 +57,8 @@ define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v16i8_14: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v16i8_14: @@ -77,8 +79,9 @@ define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v32i8_1: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v32i8_1: @@ -100,8 +103,9 @@ define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v32i8_17: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v32i8_17: