Index: llvm/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/lib/Target/X86/X86InstrSSE.td
+++ llvm/lib/Target/X86/X86InstrSSE.td
@@ -5537,6 +5537,19 @@
 let Constraints = "$src1 = $dst" in
   defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
 
+// PINSRB copies only the low byte of the 32-bit scalar operand, so remove an
+// unnecessary extension of that operand.
+let Predicates = [UseSSE41] in {
+  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext GR8:$src2)), imm:$src3),
+            (PINSRBrr VR128:$src1, (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                          GR8:$src2, sub_8bit)), imm:$src3)>;
+}
+let Predicates = [HasAVX, NoBWI] in {
+  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext GR8:$src2)), imm:$src3),
+            (VPINSRBrr VR128:$src1, (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                           GR8:$src2, sub_8bit)), imm:$src3)>;
+}
+
 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
   def rr : SS4AIi8
@test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
 ; X86-LABEL: test_mm256_insert_epi8:
 ; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_insert_epi8:
 ; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
+; X64-NEXT: vpinsrb $4, %edi, %xmm0, %xmm1
 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; X64-NEXT: retq
 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
@@ -1407,137 +1406,131 @@
 ; X86-LABEL: test_mm256_set_epi8:
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vmovd %ecx, %xmm0
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+;
X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovd %ecx, %xmm1 +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_set_epi8: ; X64: # %bb.0: -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl %r9b, %eax -; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl %r8b, 
%eax -; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl %dl, %eax -; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; X64-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; X64-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0 +; X64-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; X64-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 +; X64-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; X64-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; X64-NEXT: vmovd %ecx, %xmm1 +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq @@ -2023,137 +2016,132 @@ ; X86-LABEL: test_mm256_setr_epi8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovd %ecx, %xmm0 +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: 
movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovd %ecx, %xmm1 +; X86-NEXT: vmovd %eax, %xmm1 +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_setr_epi8: ; X64: # %bb.0: -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb 
{{[0-9]+}}(%rsp), %al +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movzbl %dil, %esi -; X64-NEXT: vmovd %esi, %xmm1 -; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl %dl, %eax -; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl %r9b, %eax -; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1 +; X64-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 +; X64-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1 +; X64-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1 +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl 
{{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al ; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -3103,37 +3103,37 @@ ; ; X86-AVX1-LABEL: test_mm_set_epi8: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x3c] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x40] -; X86-AVX1-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1] +; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] +; X86-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x3c] ; X86-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x38] ; X86-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x34] ; X86-AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x30] ; X86-AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x2c] ; X86-AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x28] ; X86-AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x24] ; X86-AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x20] ; X86-AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x1c] ; X86-AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x18] ; X86-AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] -; X86-AVX1-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x14] ; X86-AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x10] ; X86-AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x0c] ; X86-AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x08] ; X86-AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04] ; X86-AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -3241,38 +3241,32 @@ ; ; X64-AVX1-LABEL: test_mm_set_epi8: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x48] ; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50] ; X64-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-AVX1-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x48] +; X64-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x40] ; X64-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x38] ; X64-AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x30] ; X64-AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x28] ; X64-AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x20] ; X64-AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x18] ; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; 
X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x10] ; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x08] ; X64-AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09] -; X64-AVX1-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] -; X64-AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] -; X64-AVX1-NEXT: movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] -; X64-AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b] -; X64-AVX1-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] -; X64-AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] -; X64-AVX1-NEXT: movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2] -; X64-AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d] -; X64-AVX1-NEXT: movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6] -; X64-AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; X64-AVX1-NEXT: movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7] -; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f] +; X64-AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc1,0x0a] +; X64-AVX1-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc0,0x0b] +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x0c] +; X64-AVX1-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x0d] +; X64-AVX1-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc6,0x0e] +; X64-AVX1-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x0f] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_set_epi8: @@ -4113,37 +4107,37 @@ ; ; X86-AVX1-LABEL: test_mm_setr_epi8: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x04] -; X86-AVX1-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1] +; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x08] ; X86-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x0c] ; X86-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x10] ; X86-AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x14] ; X86-AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: 
[0x0f,0xb6,0x44,0x24,0x18] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x18] ; X86-AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x1c] ; X86-AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x20] ; X86-AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x24] ; X86-AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x28] ; X86-AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x2c] ; X86-AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x30] ; X86-AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x34] ; X86-AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x38] ; X86-AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x3c] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x3c] ; X86-AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; X86-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x40] ; X86-AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -4251,37 +4245,32 @@ ; ; X64-AVX1-LABEL: test_mm_setr_epi8: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6] -; X64-AVX1-NEXT: movzbl %dil, %esi # encoding: [0x40,0x0f,0xb6,0xf7] -; X64-AVX1-NEXT: vmovd %esi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc6] -; X64-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; X64-AVX1-NEXT: movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2] -; X64-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-AVX1-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] -; X64-AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] -; X64-AVX1-NEXT: movzbl 
%r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0] -; X64-AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-AVX1-NEXT: movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1] -; X64-AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X64-AVX1-NEXT: movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7] +; X64-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] +; X64-AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc6,0x01] +; X64-AVX1-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x02] +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x03] +; X64-AVX1-NEXT: vpinsrb $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc0,0x04] +; X64-AVX1-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc1,0x05] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x08] ; X64-AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x10] ; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x18] ; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x20] ; X64-AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x28] ; X64-AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x30] ; X64-AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x38] ; X64-AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x40] ; X64-AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x48] ; X64-AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50] +; X64-AVX1-NEXT: movb {{[0-9]+}}(%rsp), %al # encoding: [0x8a,0x44,0x24,0x50] ; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; Index: 
llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll =================================================================== --- llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -556,27 +556,37 @@ define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) { ; X86-SSE-LABEL: test_mm_insert_epi8: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-SSE-NEXT: pinsrb $1, %eax, %xmm0 ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test_mm_insert_epi8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_mm_insert_epi8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_insert_epi8: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test_mm_insert_epi8: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movzbl %dil, %eax -; X64-SSE-NEXT: pinsrb $1, %eax, %xmm0 +; X64-SSE-NEXT: pinsrb $1, %edi, %xmm0 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test_mm_insert_epi8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movzbl %dil, %eax -; X64-AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_mm_insert_epi8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_insert_epi8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: movzbl %dil, %eax +; X64-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; X64-AVX512-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %res = insertelement <16 x i8> %arg0, i8 %a1,i32 1 %bc = bitcast <16 x i8> %res to <2 x i64> Index: llvm/test/CodeGen/X86/vec_saddo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_saddo.ll +++ llvm/test/CodeGen/X86/vec_saddo.ll @@ -1729,37 +1729,36 @@ ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: adcq %r11, %rax +; SSE41-NEXT: movq %rcx, %rbp +; SSE41-NEXT: adcq %r11, %rbp ; SSE41-NEXT: setns %bl ; SSE41-NEXT: testq %rcx, %rcx ; SSE41-NEXT: setns %cl ; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %bpl +; SSE41-NEXT: setne %al ; SSE41-NEXT: testq %r11, %r11 ; SSE41-NEXT: setns %bl ; SSE41-NEXT: cmpb %bl, %cl ; SSE41-NEXT: sete %cl -; SSE41-NEXT: andb %bpl, %cl -; SSE41-NEXT: movzbl %cl, %ebp +; SSE41-NEXT: andb %al, %cl ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setns %bl ; SSE41-NEXT: testq %rsi, %rsi -; SSE41-NEXT: setns %cl -; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setns %al +; SSE41-NEXT: cmpb %bl, %al ; SSE41-NEXT: sete %r11b ; SSE41-NEXT: addq %r8, %rdi ; SSE41-NEXT: adcq %r9, %rsi ; SSE41-NEXT: setns %bl -; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %cl -; SSE41-NEXT: andb %r11b, %cl -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 +; SSE41-NEXT: cmpb %bl, %al +; SSE41-NEXT: setne %al +; SSE41-NEXT: andb %r11b, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 ; SSE41-NEXT: movq %rdx, 16(%r10) ; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rax, 24(%r10) +; SSE41-NEXT: movq %rbp, 24(%r10) ; SSE41-NEXT: movq %rsi, 8(%r10) 
; SSE41-NEXT: psllq $63, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -1775,37 +1774,36 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: adcq %r11, %rax +; AVX1-NEXT: movq %rcx, %rbp +; AVX1-NEXT: adcq %r11, %rbp ; AVX1-NEXT: setns %bl ; AVX1-NEXT: testq %rcx, %rcx ; AVX1-NEXT: setns %cl ; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %bpl +; AVX1-NEXT: setne %al ; AVX1-NEXT: testq %r11, %r11 ; AVX1-NEXT: setns %bl ; AVX1-NEXT: cmpb %bl, %cl ; AVX1-NEXT: sete %cl -; AVX1-NEXT: andb %bpl, %cl -; AVX1-NEXT: movzbl %cl, %ebp +; AVX1-NEXT: andb %al, %cl ; AVX1-NEXT: testq %r9, %r9 ; AVX1-NEXT: setns %bl ; AVX1-NEXT: testq %rsi, %rsi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setns %al +; AVX1-NEXT: cmpb %bl, %al ; AVX1-NEXT: sete %r11b ; AVX1-NEXT: addq %r8, %rdi ; AVX1-NEXT: adcq %r9, %rsi ; AVX1-NEXT: setns %bl -; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %cl -; AVX1-NEXT: andb %r11b, %cl -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: cmpb %bl, %al +; AVX1-NEXT: setne %al +; AVX1-NEXT: andb %r11b, %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movq %rdx, 16(%r10) ; AVX1-NEXT: movq %rdi, (%r10) -; AVX1-NEXT: movq %rax, 24(%r10) +; AVX1-NEXT: movq %rbp, 24(%r10) ; AVX1-NEXT: movq %rsi, 8(%r10) ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1821,37 +1819,36 @@ ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: adcq %r11, %rax +; AVX2-NEXT: movq %rcx, %rbp +; AVX2-NEXT: adcq %r11, %rbp ; AVX2-NEXT: setns %bl ; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: setns %cl ; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %bpl +; AVX2-NEXT: setne %al ; AVX2-NEXT: testq %r11, %r11 ; AVX2-NEXT: setns %bl ; AVX2-NEXT: cmpb %bl, %cl ; AVX2-NEXT: sete %cl -; AVX2-NEXT: andb %bpl, %cl -; AVX2-NEXT: movzbl %cl, %ebp +; AVX2-NEXT: andb %al, %cl ; AVX2-NEXT: testq %r9, %r9 ; AVX2-NEXT: setns %bl ; AVX2-NEXT: testq %rsi, %rsi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setns %al +; AVX2-NEXT: cmpb %bl, %al ; AVX2-NEXT: sete %r11b ; AVX2-NEXT: addq %r8, %rdi ; AVX2-NEXT: adcq %r9, %rsi ; AVX2-NEXT: setns %bl -; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %cl -; AVX2-NEXT: andb %r11b, %cl -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: cmpb %bl, %al +; AVX2-NEXT: setne %al +; AVX2-NEXT: andb %r11b, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rdx, 16(%r10) ; AVX2-NEXT: movq %rdi, (%r10) -; AVX2-NEXT: movq %rax, 24(%r10) +; AVX2-NEXT: movq %rbp, 24(%r10) ; AVX2-NEXT: movq %rsi, 8(%r10) ; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 Index: llvm/test/CodeGen/X86/vec_smulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_smulo.ll +++ llvm/test/CodeGen/X86/vec_smulo.ll @@ -2572,7 +2572,6 @@ ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: movq %r12, %rcx ; SSE41-NEXT: callq __muloti4 -; SSE41-NEXT: xorl %ecx, %ecx ; SSE41-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; SSE41-NEXT: setne %cl ; SSE41-NEXT: xorl %esi, %esi @@ -2624,7 +2623,6 
@@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX1-NEXT: movq %r12, %rcx ; AVX1-NEXT: callq __muloti4 -; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: setne %cl ; AVX1-NEXT: xorl %esi, %esi @@ -2676,7 +2674,6 @@ ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: movq %r12, %rcx ; AVX2-NEXT: callq __muloti4 -; AVX2-NEXT: xorl %ecx, %ecx ; AVX2-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: setne %cl ; AVX2-NEXT: xorl %esi, %esi Index: llvm/test/CodeGen/X86/vec_ssubo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_ssubo.ll +++ llvm/test/CodeGen/X86/vec_ssubo.ll @@ -1768,37 +1768,36 @@ ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: sbbq %r11, %rax +; SSE41-NEXT: movq %rcx, %rbp +; SSE41-NEXT: sbbq %r11, %rbp ; SSE41-NEXT: setns %bl ; SSE41-NEXT: testq %rcx, %rcx ; SSE41-NEXT: setns %cl ; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %bpl +; SSE41-NEXT: setne %al ; SSE41-NEXT: testq %r11, %r11 ; SSE41-NEXT: setns %bl ; SSE41-NEXT: cmpb %bl, %cl ; SSE41-NEXT: setne %cl -; SSE41-NEXT: andb %bpl, %cl -; SSE41-NEXT: movzbl %cl, %ebp +; SSE41-NEXT: andb %al, %cl ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setns %bl ; SSE41-NEXT: testq %rsi, %rsi -; SSE41-NEXT: setns %cl -; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setns %al +; SSE41-NEXT: cmpb %bl, %al ; SSE41-NEXT: setne %r11b ; SSE41-NEXT: subq %r8, %rdi ; SSE41-NEXT: sbbq %r9, %rsi ; SSE41-NEXT: setns %bl -; SSE41-NEXT: cmpb %bl, %cl -; SSE41-NEXT: setne %cl -; SSE41-NEXT: andb %r11b, %cl -; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 +; SSE41-NEXT: cmpb %bl, %al +; SSE41-NEXT: setne %al +; SSE41-NEXT: andb %r11b, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 ; SSE41-NEXT: movq %rdx, 16(%r10) ; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rax, 24(%r10) +; SSE41-NEXT: movq %rbp, 24(%r10) ; SSE41-NEXT: movq %rsi, 8(%r10) ; SSE41-NEXT: psllq $63, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -1814,37 +1813,36 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: sbbq %r11, %rax +; AVX1-NEXT: movq %rcx, %rbp +; AVX1-NEXT: sbbq %r11, %rbp ; AVX1-NEXT: setns %bl ; AVX1-NEXT: testq %rcx, %rcx ; AVX1-NEXT: setns %cl ; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %bpl +; AVX1-NEXT: setne %al ; AVX1-NEXT: testq %r11, %r11 ; AVX1-NEXT: setns %bl ; AVX1-NEXT: cmpb %bl, %cl ; AVX1-NEXT: setne %cl -; AVX1-NEXT: andb %bpl, %cl -; AVX1-NEXT: movzbl %cl, %ebp +; AVX1-NEXT: andb %al, %cl ; AVX1-NEXT: testq %r9, %r9 ; AVX1-NEXT: setns %bl ; AVX1-NEXT: testq %rsi, %rsi -; AVX1-NEXT: setns %cl -; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setns %al +; AVX1-NEXT: cmpb %bl, %al ; AVX1-NEXT: setne %r11b ; AVX1-NEXT: subq %r8, %rdi ; AVX1-NEXT: sbbq %r9, %rsi ; AVX1-NEXT: setns %bl -; AVX1-NEXT: cmpb %bl, %cl -; AVX1-NEXT: setne %cl -; AVX1-NEXT: andb %r11b, %cl -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: cmpb %bl, %al +; AVX1-NEXT: setne %al +; AVX1-NEXT: andb %r11b, %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movq %rdx, 16(%r10) ; AVX1-NEXT: movq %rdi, (%r10) -; AVX1-NEXT: movq 
%rax, 24(%r10) +; AVX1-NEXT: movq %rbp, 24(%r10) ; AVX1-NEXT: movq %rsi, 8(%r10) ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1860,37 +1858,36 @@ ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: sbbq %r11, %rax +; AVX2-NEXT: movq %rcx, %rbp +; AVX2-NEXT: sbbq %r11, %rbp ; AVX2-NEXT: setns %bl ; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: setns %cl ; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %bpl +; AVX2-NEXT: setne %al ; AVX2-NEXT: testq %r11, %r11 ; AVX2-NEXT: setns %bl ; AVX2-NEXT: cmpb %bl, %cl ; AVX2-NEXT: setne %cl -; AVX2-NEXT: andb %bpl, %cl -; AVX2-NEXT: movzbl %cl, %ebp +; AVX2-NEXT: andb %al, %cl ; AVX2-NEXT: testq %r9, %r9 ; AVX2-NEXT: setns %bl ; AVX2-NEXT: testq %rsi, %rsi -; AVX2-NEXT: setns %cl -; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setns %al +; AVX2-NEXT: cmpb %bl, %al ; AVX2-NEXT: setne %r11b ; AVX2-NEXT: subq %r8, %rdi ; AVX2-NEXT: sbbq %r9, %rsi ; AVX2-NEXT: setns %bl -; AVX2-NEXT: cmpb %bl, %cl -; AVX2-NEXT: setne %cl -; AVX2-NEXT: andb %r11b, %cl -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: cmpb %bl, %al +; AVX2-NEXT: setne %al +; AVX2-NEXT: andb %r11b, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rdx, 16(%r10) ; AVX2-NEXT: movq %rdi, (%r10) -; AVX2-NEXT: movq %rax, 24(%r10) +; AVX2-NEXT: movq %rbp, 24(%r10) ; AVX2-NEXT: movq %rsi, 8(%r10) ; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 Index: llvm/test/CodeGen/X86/vec_uaddo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_uaddo.ll +++ llvm/test/CodeGen/X86/vec_uaddo.ll @@ -1270,8 +1270,7 @@ ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %r11d +; SSE41-NEXT: setb %r11b ; SSE41-NEXT: addq %r8, %rdi ; SSE41-NEXT: adcq %r9, %rsi ; SSE41-NEXT: setb %al @@ -1292,8 +1291,7 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX1-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: setb %al -; AVX1-NEXT: movzbl %al, %r11d +; AVX1-NEXT: setb %r11b ; AVX1-NEXT: addq %r8, %rdi ; AVX1-NEXT: adcq %r9, %rsi ; AVX1-NEXT: setb %al @@ -1314,8 +1312,7 @@ ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: setb %al -; AVX2-NEXT: movzbl %al, %r11d +; AVX2-NEXT: setb %r11b ; AVX2-NEXT: addq %r8, %rdi ; AVX2-NEXT: adcq %r9, %rsi ; AVX2-NEXT: setb %al Index: llvm/test/CodeGen/X86/vec_umulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_umulo.ll +++ llvm/test/CodeGen/X86/vec_umulo.ll @@ -2327,10 +2327,11 @@ ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movq %r8, %r14 ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: movq %rdx, %r12 +; SSE41-NEXT: movq %rsi, %r8 ; SSE41-NEXT: movq %rdi, %r11 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: testq %r10, %r10 @@ -2352,38 +2353,38 @@ ; SSE41-NEXT: movq %rax, %r10 ; SSE41-NEXT: movq %rdx, %r15 ; SSE41-NEXT: addq %rbx, %r15 -; SSE41-NEXT: setb %al -; SSE41-NEXT: orb %cl, %al -; 
SSE41-NEXT: orb %r13b, %al -; SSE41-NEXT: movzbl %al, %ebp +; SSE41-NEXT: setb %sil +; SSE41-NEXT: orb %cl, %sil +; SSE41-NEXT: orb %r13b, %sil ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al -; SSE41-NEXT: testq %rsi, %rsi -; SSE41-NEXT: setne %r13b -; SSE41-NEXT: andb %al, %r13b -; SSE41-NEXT: movq %rsi, %rax -; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: seto %r12b +; SSE41-NEXT: testq %r8, %r8 +; SSE41-NEXT: setne %r12b +; SSE41-NEXT: andb %al, %r12b +; SSE41-NEXT: movq %r8, %rax +; SSE41-NEXT: mulq %r14 +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: seto %bpl ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %r11 ; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: seto %bl -; SSE41-NEXT: orb %r12b, %bl -; SSE41-NEXT: addq %rsi, %rdi +; SSE41-NEXT: seto %cl +; SSE41-NEXT: orb %bpl, %cl +; SSE41-NEXT: addq %rbx, %rdi ; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: mulq %r14 ; SSE41-NEXT: addq %rdi, %rdx -; SSE41-NEXT: setb %cl -; SSE41-NEXT: orb %bl, %cl -; SSE41-NEXT: orb %r13b, %cl -; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: setb %bl +; SSE41-NEXT: orb %cl, %bl +; SSE41-NEXT: orb %r12b, %bl +; SSE41-NEXT: movzbl %bl, %ecx ; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 -; SSE41-NEXT: movq %r10, 16(%r14) -; SSE41-NEXT: movq %rax, (%r14) -; SSE41-NEXT: movq %r15, 24(%r14) -; SSE41-NEXT: movq %rdx, 8(%r14) +; SSE41-NEXT: pinsrb $8, %esi, %xmm0 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movq %r10, 16(%rcx) +; SSE41-NEXT: movq %rax, (%rcx) +; SSE41-NEXT: movq %r15, 24(%rcx) +; SSE41-NEXT: movq %rdx, 8(%rcx) ; SSE41-NEXT: psllq $63, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] @@ -2403,10 +2404,11 @@ ; AVX1-NEXT: pushq %r13 ; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: movq %r8, %r14 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq %rsi, %r8 ; AVX1-NEXT: movq %rdi, %r11 -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: testq %r10, %r10 @@ -2428,38 +2430,38 @@ ; AVX1-NEXT: movq %rax, %r10 ; AVX1-NEXT: movq %rdx, %r15 ; AVX1-NEXT: addq %rbx, %r15 -; AVX1-NEXT: setb %al -; AVX1-NEXT: orb %cl, %al -; AVX1-NEXT: orb %r13b, %al -; AVX1-NEXT: movzbl %al, %ebp +; AVX1-NEXT: setb %sil +; AVX1-NEXT: orb %cl, %sil +; AVX1-NEXT: orb %r13b, %sil ; AVX1-NEXT: testq %r9, %r9 ; AVX1-NEXT: setne %al -; AVX1-NEXT: testq %rsi, %rsi -; AVX1-NEXT: setne %r13b -; AVX1-NEXT: andb %al, %r13b -; AVX1-NEXT: movq %rsi, %rax -; AVX1-NEXT: mulq %r8 -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: seto %r12b +; AVX1-NEXT: testq %r8, %r8 +; AVX1-NEXT: setne %r12b +; AVX1-NEXT: andb %al, %r12b +; AVX1-NEXT: movq %r8, %rax +; AVX1-NEXT: mulq %r14 +; AVX1-NEXT: movq %rax, %rbx +; AVX1-NEXT: seto %bpl ; AVX1-NEXT: movq %r9, %rax ; AVX1-NEXT: mulq %r11 ; AVX1-NEXT: movq %rax, %rdi ; AVX1-NEXT: seto %cl -; AVX1-NEXT: orb %r12b, %cl -; AVX1-NEXT: addq %rsi, %rdi +; AVX1-NEXT: orb %bpl, %cl +; AVX1-NEXT: addq %rbx, %rdi ; AVX1-NEXT: movq %r11, %rax -; AVX1-NEXT: mulq %r8 +; AVX1-NEXT: mulq %r14 ; AVX1-NEXT: addq %rdi, %rdx ; AVX1-NEXT: setb %bl ; AVX1-NEXT: orb %cl, %bl -; AVX1-NEXT: orb %r13b, %bl +; AVX1-NEXT: orb %r12b, %bl ; AVX1-NEXT: movzbl %bl, %ecx ; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %r10, 16(%r14) -; AVX1-NEXT: movq %rax, (%r14) -; AVX1-NEXT: movq %r15, 24(%r14) -; AVX1-NEXT: movq %rdx, 8(%r14) +; AVX1-NEXT: vpinsrb $8, 
%esi, %xmm0, %xmm0 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT: movq %r10, 16(%rcx) +; AVX1-NEXT: movq %rax, (%rcx) +; AVX1-NEXT: movq %r15, 24(%rcx) +; AVX1-NEXT: movq %rdx, 8(%rcx) ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 @@ -2479,10 +2481,11 @@ ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq %r8, %r14 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq %rsi, %r8 ; AVX2-NEXT: movq %rdi, %r11 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: testq %r10, %r10 @@ -2504,38 +2507,38 @@ ; AVX2-NEXT: movq %rax, %r10 ; AVX2-NEXT: movq %rdx, %r15 ; AVX2-NEXT: addq %rbx, %r15 -; AVX2-NEXT: setb %al -; AVX2-NEXT: orb %cl, %al -; AVX2-NEXT: orb %r13b, %al -; AVX2-NEXT: movzbl %al, %ebp +; AVX2-NEXT: setb %sil +; AVX2-NEXT: orb %cl, %sil +; AVX2-NEXT: orb %r13b, %sil ; AVX2-NEXT: testq %r9, %r9 ; AVX2-NEXT: setne %al -; AVX2-NEXT: testq %rsi, %rsi -; AVX2-NEXT: setne %r13b -; AVX2-NEXT: andb %al, %r13b -; AVX2-NEXT: movq %rsi, %rax -; AVX2-NEXT: mulq %r8 -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: seto %r12b +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: setne %r12b +; AVX2-NEXT: andb %al, %r12b +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: mulq %r14 +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: seto %bpl ; AVX2-NEXT: movq %r9, %rax ; AVX2-NEXT: mulq %r11 ; AVX2-NEXT: movq %rax, %rdi ; AVX2-NEXT: seto %cl -; AVX2-NEXT: orb %r12b, %cl -; AVX2-NEXT: addq %rsi, %rdi +; AVX2-NEXT: orb %bpl, %cl +; AVX2-NEXT: addq %rbx, %rdi ; AVX2-NEXT: movq %r11, %rax -; AVX2-NEXT: mulq %r8 +; AVX2-NEXT: mulq %r14 ; AVX2-NEXT: addq %rdi, %rdx ; AVX2-NEXT: setb %bl ; AVX2-NEXT: orb %cl, %bl -; AVX2-NEXT: orb %r13b, %bl +; AVX2-NEXT: orb %r12b, %bl ; AVX2-NEXT: movzbl %bl, %ecx ; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %r10, 16(%r14) -; AVX2-NEXT: movq %rax, (%r14) -; AVX2-NEXT: movq %r15, 24(%r14) -; AVX2-NEXT: movq %rdx, 8(%r14) +; AVX2-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq %r10, 16(%rcx) +; AVX2-NEXT: movq %rax, (%rcx) +; AVX2-NEXT: movq %r15, 24(%rcx) +; AVX2-NEXT: movq %rdx, 8(%rcx) ; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 Index: llvm/test/CodeGen/X86/vec_usubo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_usubo.ll +++ llvm/test/CodeGen/X86/vec_usubo.ll @@ -1311,8 +1311,7 @@ ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %r11d +; SSE41-NEXT: setb %r11b ; SSE41-NEXT: subq %r8, %rdi ; SSE41-NEXT: sbbq %r9, %rsi ; SSE41-NEXT: setb %al @@ -1333,8 +1332,7 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX1-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: setb %al -; AVX1-NEXT: movzbl %al, %r11d +; AVX1-NEXT: setb %r11b ; AVX1-NEXT: subq %r8, %rdi ; AVX1-NEXT: sbbq %r9, %rsi ; AVX1-NEXT: setb %al @@ -1355,8 +1353,7 @@ ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: setb %al -; AVX2-NEXT: movzbl %al, %r11d +; AVX2-NEXT: setb %r11b ; AVX2-NEXT: subq %r8, %rdi ; AVX2-NEXT: sbbq %r9, %rsi ; AVX2-NEXT: 
setb %al
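
; Reviewer sketch, not part of the patch: a minimal standalone reproducer of
; the pattern the new X86InstrSSE.td rules target, modeled on
; test_mm_insert_epi8 in sse41-intrinsics-fast-isel.ll. The function name,
; RUN line, and CHECK lines below are my own assumptions written in the usual
; style of these tests; with the added patterns, llc for an SSE4.1 target
; should select pinsrb directly on the i8 argument's 32-bit super-register
; rather than emitting a separate movzbl first.
;
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s
;
; CHECK-LABEL: insert_byte:
; CHECK-NOT: movzbl %dil
; CHECK: pinsrb $1, %edi, %xmm0
define <2 x i64> @insert_byte(<2 x i64> %a0, i8 %a1) nounwind {
  ; Reinterpret the vector as 16 bytes and insert the scalar byte at lane 1,
  ; mirroring what _mm_insert_epi8(v, b, 1) lowers to.
  %vec = bitcast <2 x i64> %a0 to <16 x i8>
  %ins = insertelement <16 x i8> %vec, i8 %a1, i32 1
  %res = bitcast <16 x i8> %ins to <2 x i64>
  ret <2 x i64> %res
}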