Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -13935,7 +13935,33 @@ if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; - // TODO: handle v16i8. + // TODO: We only extract a single element from v16i8, we can probably afford + // to be more aggressive here before using the default approach of spilling to + // stack. + if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { + // Extract either the lowest i32 or any i16, and extract the sub-byte. + int DWordIdx = IdxVal / 4; + if (DWordIdx == 0) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), + DAG.getIntPtrConstant(DWordIdx, dl)); + int ShiftVal = (IdxVal % 4) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, + DAG.getConstant(ShiftVal, dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + int WordIdx = IdxVal / 2; + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + DAG.getBitcast(MVT::v8i16, Vec), + DAG.getIntPtrConstant(WordIdx, dl)); + int ShiftVal = (IdxVal % 2) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, + DAG.getConstant(ShiftVal, dl, MVT::i16)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) Index: llvm/trunk/test/CodeGen/X86/extract-store.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/extract-store.ll +++ llvm/trunk/test/CodeGen/X86/extract-store.ll @@ -9,22 +9,14 @@ define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind { ; SSE2-X32-LABEL: extract_i8_0: ; SSE2-X32: # BB#0: -; SSE2-X32-NEXT: pushl %ebp -; SSE2-X32-NEXT: movl %esp, %ebp -; SSE2-X32-NEXT: andl $-16, %esp -; SSE2-X32-NEXT: subl $32, %esp -; SSE2-X32-NEXT: movl 8(%ebp), %eax -; SSE2-X32-NEXT: movaps %xmm0, (%esp) -; SSE2-X32-NEXT: movb (%esp), %cl +; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2-X32-NEXT: movd %xmm0, %ecx ; SSE2-X32-NEXT: movb %cl, (%eax) -; SSE2-X32-NEXT: movl %ebp, %esp -; SSE2-X32-NEXT: popl %ebp ; SSE2-X32-NEXT: retl ; ; SSE2-X64-LABEL: extract_i8_0: ; SSE2-X64: # BB#0: -; SSE2-X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-X64-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-X64-NEXT: movd %xmm0, %eax ; SSE2-X64-NEXT: movb %al, (%rdi) ; SSE2-X64-NEXT: retq ; @@ -57,22 +49,16 @@ define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind { ; SSE2-X32-LABEL: extract_i8_3: ; SSE2-X32: # BB#0: -; SSE2-X32-NEXT: pushl %ebp -; SSE2-X32-NEXT: movl %esp, %ebp -; SSE2-X32-NEXT: andl $-16, %esp -; SSE2-X32-NEXT: subl $32, %esp -; SSE2-X32-NEXT: movl 8(%ebp), %eax -; SSE2-X32-NEXT: movaps %xmm0, (%esp) -; SSE2-X32-NEXT: movb {{[0-9]+}}(%esp), %cl +; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2-X32-NEXT: movd %xmm0, %ecx +; SSE2-X32-NEXT: shrl $24, %ecx ; SSE2-X32-NEXT: movb %cl, (%eax) -; SSE2-X32-NEXT: movl %ebp, %esp -; SSE2-X32-NEXT: popl %ebp ; SSE2-X32-NEXT: retl ; ; SSE2-X64-LABEL: extract_i8_3: ; SSE2-X64: # BB#0: -; SSE2-X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-X64-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-X64-NEXT: movd %xmm0, %eax +; SSE2-X64-NEXT: shrl $24, %eax ; SSE2-X64-NEXT: movb %al, (%rdi) ; SSE2-X64-NEXT: retq ; @@ -105,23 +91,15 @@ define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) nounwind { ; SSE2-X32-LABEL: extract_i8_15: ; SSE2-X32: # BB#0: -; SSE2-X32-NEXT: pushl %ebp -; SSE2-X32-NEXT: movl %esp, %ebp -; SSE2-X32-NEXT: andl $-16, %esp -; SSE2-X32-NEXT: subl $32, %esp -; SSE2-X32-NEXT: movl 8(%ebp), %eax -; SSE2-X32-NEXT: movaps %xmm0, (%esp) -; SSE2-X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; SSE2-X32-NEXT: movb %cl, (%eax) -; SSE2-X32-NEXT: movl %ebp, %esp -; SSE2-X32-NEXT: popl %ebp +; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2-X32-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-X32-NEXT: movb %ch, (%eax) ; SSE2-X32-NEXT: retl ; ; SSE2-X64-LABEL: extract_i8_15: ; SSE2-X64: # BB#0: -; SSE2-X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-X64-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-X64-NEXT: movb %al, (%rdi) +; SSE2-X64-NEXT: pextrw $7, %xmm0, %eax +; SSE2-X64-NEXT: movb %ah, (%rdi) # NOREX ; SSE2-X64-NEXT: retq ; ; SSE41-X32-LABEL: extract_i8_15: Index: llvm/trunk/test/CodeGen/X86/extractelement-index.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/extractelement-index.ll +++ llvm/trunk/test/CodeGen/X86/extractelement-index.ll @@ -11,8 +11,9 @@ define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v16i8_1: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v16i8_1: @@ -33,8 +34,9 @@ define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v16i8_11: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v16i8_11: @@ -55,8 +57,8 @@ define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v16i8_14: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v16i8_14: @@ -77,8 +79,9 @@ define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v32i8_1: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v32i8_1: @@ -100,8 +103,9 @@ define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind { ; SSE2-LABEL: extractelement_v32i8_17: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v32i8_17: