Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -38923,6 +38923,21 @@ return DCI.CombineTo(N, NewVec, TF, true); } + // Bool vector load - attempt to cast to an integer, as we have good + // (vXiY *ext(vXi1 bitcast(iX))) handling. + if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() && + RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) { + unsigned NumElts = RegVT.getVectorNumElements(); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + if (TLI.isTypeLegal(IntVT)) { + SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Alignment, + Ld->getMemOperand()->getFlags()); + SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); + return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); + } + } + return SDValue(); } Index: llvm/trunk/test/CodeGen/X86/vector-sext-widen.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-sext-widen.ll +++ llvm/trunk/test/CodeGen/X86/vector-sext-widen.ll @@ -2026,193 +2026,32 @@ } define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { -; SSE2-LABEL: load_sext_8i1_to_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsbq (%rdi), %rax -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq $7, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $57, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $58, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $59, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $60, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $61, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $62, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: shlq $63, %rax -; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i1_to_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsbq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shrq $7, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $57, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $58, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $59, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $60, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $61, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $62, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: shlq $63, %rax -; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i1_to_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movsbq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $3, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $5, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE41-NEXT: shrl $7, %eax -; SSE41-NEXT: pinsrw $7, %eax, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_8i1_to_8i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_8i1_to_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movsbq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $60, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $59, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $58, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $57, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrl $7, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_8i1_to_8i16: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movsbq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $60, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $59, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $58, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $57, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrl $7, %eax -; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_8i1_to_8i16: @@ -2232,84 +2071,16 @@ ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; -; X32-SSE2-LABEL: load_sext_8i1_to_8i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movsbl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $25, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $26, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $27, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $28, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $29, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $30, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: shll $31, %eax -; X32-SSE2-NEXT: sarl $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_8i1_to_8i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movsbl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $30, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: shll $31, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movd %edx, %xmm0 -; X32-SSE41-NEXT: pinsrw $1, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $29, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $2, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $28, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $3, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $27, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $4, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $26, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $5, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $25, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $6, %ecx, %xmm0 -; X32-SSE41-NEXT: shrl $7, %eax -; X32-SSE41-NEXT: pinsrw $7, %eax, %xmm0 -; X32-SSE41-NEXT: retl +; X32-SSE-LABEL: load_sext_8i1_to_8i16: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; X32-SSE-NEXT: retl entry: %X = load <8 x i1>, <8 x i1>* %ptr %Y = sext <8 x i1> %X to <8 x i16> @@ -2466,322 +2237,81 @@ } define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { -; SSE2-LABEL: load_sext_8i1_to_8i32: +; SSE-LABEL: load_sext_8i1_to_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_8i1_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i1_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i1_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: kmovw (%rdi), %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq +; +; X32-SSE-LABEL: load_sext_8i1_to_8i32: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; X32-SSE-NEXT: retl +entry: + %X = load <8 x i1>, <8 x i1>* %ptr + %Y = sext <8 x i1> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i32: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzbl (%rdi), %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $5, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: retq ; -; SSSE3-LABEL: load_sext_8i1_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzbl (%rdi), %eax -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $6, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $5, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: shrl %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i1_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movzbl (%rdi), %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: andl $1, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $2, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $3, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $3, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $4, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $5, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $5, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $6, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $6, %ecx, %xmm1 -; SSE41-NEXT: shrl $7, %eax -; SSE41-NEXT: pinsrw $7, %eax, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: pslld $31, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_8i1_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movsbq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $58, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $59, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $57, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $7, %rcx -; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm1 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $60, %rax -; AVX1-NEXT: sarq $63, %rax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_8i1_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movsbq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $58, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $59, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $57, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $7, %rcx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm1 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $60, %rax -; AVX2-NEXT: sarq $63, %rax -; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_8i1_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_8i1_to_8i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzbl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $6, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $5, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $4, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $3, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $2, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: shrl %eax -; X32-SSE2-NEXT: andl $1, %eax -; X32-SSE2-NEXT: movd %eax, %xmm3 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: pslld $31, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: pslld $31, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_8i1_to_8i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzbl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: andl $1, %edx -; X32-SSE41-NEXT: movd %edx, %xmm1 -; X32-SSE41-NEXT: pinsrw $1, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $2, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $2, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $3, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $3, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $4, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $4, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $5, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $5, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $6, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $6, %ecx, %xmm1 -; X32-SSE41-NEXT: shrl $7, %eax -; X32-SSE41-NEXT: pinsrw $7, %eax, %xmm1 -; X32-SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X32-SSE41-NEXT: pslld $31, %xmm0 -; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE41-NEXT: pslld $31, %xmm1 -; X32-SSE41-NEXT: psrad $31, %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <8 x i1>, <8 x i1>* %ptr - %Y = sext <8 x i1> %X to <8 x i32> - ret <8 x i32> %Y -} - -define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { -; SSE2-LABEL: load_sext_8i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i8_to_8i32: +; SSSE3-LABEL: load_sext_8i8_to_8i32: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2840,388 +2370,50 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; SSE2-LABEL: load_sext_16i1_to_16i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movswq (%rdi), %rax -; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: movq %rax, %r15 -; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: shrq $15, %rbp -; SSE2-NEXT: movd %ebp, %xmm0 -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: movsbq %al, %rax -; SSE2-NEXT: shlq $49, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: shlq $50, %r9 -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm2 -; SSE2-NEXT: shlq $51, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm3 -; SSE2-NEXT: shlq $52, %r11 -; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movd %r11d, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: shlq $53, %r14 -; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: shlq $54, %r15 -; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: movd %r15d, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: shlq $55, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: shlq $60, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movd %r13d, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: shlq $61, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movd %ebx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: shlq $62, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: shlq $63, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: shlq $58, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: shlq $59, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: shlq $57, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: movd %ebp, %xmm2 -; SSE2-NEXT: shrq $7, %rax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_16i1_to_16i8: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movswq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %r8 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: movq %rax, %r15 -; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: shrq $15, %rbp -; SSSE3-NEXT: movd %ebp, %xmm0 -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: movsbq %al, %rax -; SSSE3-NEXT: shlq $49, %r8 -; SSSE3-NEXT: sarq $63, %r8 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: shlq $50, %r9 -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm2 -; SSSE3-NEXT: shlq $51, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm3 -; SSSE3-NEXT: shlq $52, %r11 -; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movd %r11d, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: shlq $53, %r14 -; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: shlq $54, %r15 -; SSSE3-NEXT: sarq $63, %r15 -; SSSE3-NEXT: movd %r15d, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSSE3-NEXT: shlq $55, %r12 -; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: shlq $60, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movd %r13d, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: shlq $61, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movd %ebx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: shlq $62, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: shlq $63, %rdx -; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSSE3-NEXT: shlq $58, %rsi -; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: shlq $59, %rdi -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: movd %edi, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: shlq $57, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: movd %ebp, %xmm2 -; SSSE3-NEXT: shrq $7, %rax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_16i1_to_16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movswq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 -; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $55, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $54, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $53, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $52, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $51, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $50, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $49, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: load_sext_16i1_to_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movswq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $60, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $59, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $58, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $57, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movsbq %al, %rcx -; AVX1-NEXT: shrl $7, %ecx -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $55, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $54, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $53, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $52, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $51, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $50, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $49, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrl $15, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_16i1_to_16i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movswq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $60, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $59, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $58, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $57, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movsbq %al, %rcx -; AVX2-NEXT: shrl $7, %ecx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $55, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $54, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $53, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $52, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $51, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $50, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $49, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrl $15, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_16i1_to_16i8: @@ -3242,160 +2434,24 @@ ; ; X32-SSE2-LABEL: load_sext_16i1_to_16i8: ; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: pushl %ebp -; X32-SSE2-NEXT: pushl %ebx -; X32-SSE2-NEXT: pushl %edi -; X32-SSE2-NEXT: pushl %esi ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movswl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shrl $15, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm2 -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shll $17, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: shll $18, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm1 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: shll $19, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $20, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm4 -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: shll $21, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm6 -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: shll $22, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm7 -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shll $23, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm5 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X32-SSE2-NEXT: shll $28, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm2 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movsbl %al, %eax -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; X32-SSE2-NEXT: shll $29, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; X32-SSE2-NEXT: shll $30, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X32-SSE2-NEXT: shll $31, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X32-SSE2-NEXT: shll $26, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE2-NEXT: shll $27, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm3 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X32-SSE2-NEXT: shll $25, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm1 -; X32-SSE2-NEXT: shrl $7, %eax -; X32-SSE2-NEXT: movd %eax, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; X32-SSE2-NEXT: popl %esi -; X32-SSE2-NEXT: popl %edi -; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movswl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $30, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: shll $31, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movd %edx, %xmm0 -; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $29, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $28, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $27, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $26, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $25, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm0 -; X32-SSE41-NEXT: movsbl %al, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $23, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $22, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $21, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $20, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $19, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $18, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $17, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; X32-SSE41-NEXT: shrl $15, %eax -; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; X32-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE41-NEXT: pand %xmm1, %xmm0 +; X32-SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; X32-SSE41-NEXT: retl entry: %X = load <16 x i1>, <16 x i1>* %ptr @@ -3404,442 +2460,43 @@ } define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { -; SSE2-LABEL: load_sext_16i1_to_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $14, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $13, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $12, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $11, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $10, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $9, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $7, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $5, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psllw $15, %xmm0 -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psllw $15, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_16i1_to_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzwl (%rdi), %eax -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $15, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $14, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $13, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $12, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $11, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $10, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $9, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $8, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $7, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $6, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $5, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: shrl %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psllw $15, %xmm0 -; SSSE3-NEXT: psraw $15, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: psllw $15, %xmm1 -; SSSE3-NEXT: psraw $15, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_16i1_to_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movzwl (%rdi), %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: andl $1, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $2, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $3, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $4, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $5, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $6, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $8, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $9, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $10, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $11, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $12, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $13, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $14, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: psllw $15, %xmm0 -; SSE41-NEXT: psraw $15, %xmm0 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: psllw $15, %xmm1 -; SSE41-NEXT: psraw $15, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_16i1_to_16i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_16i1_to_16i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: .cfi_def_cfa_offset 16 -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: .cfi_def_cfa_offset 24 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: .cfi_def_cfa_offset 32 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: .cfi_def_cfa_offset 40 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: .cfi_def_cfa_offset 48 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: .cfi_def_cfa_offset 56 -; AVX1-NEXT: .cfi_offset %rbx, -56 -; AVX1-NEXT: .cfi_offset %r12, -48 -; AVX1-NEXT: .cfi_offset %r13, -40 -; AVX1-NEXT: .cfi_offset %r14, -32 -; AVX1-NEXT: .cfi_offset %r15, -24 -; AVX1-NEXT: .cfi_offset %rbp, -16 -; AVX1-NEXT: movswq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $55, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: movq %rax, %r10 -; AVX1-NEXT: movq %rax, %r11 -; AVX1-NEXT: movq %rax, %r14 -; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: movl %eax, %r9d -; AVX1-NEXT: movq %rax, %r12 -; AVX1-NEXT: movq %rax, %r13 -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: movsbq %al, %rbp -; AVX1-NEXT: shlq $54, %rax -; AVX1-NEXT: sarq $63, %rax -; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: shlq $53, %r8 -; AVX1-NEXT: sarq $63, %r8 -; AVX1-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $52, %r10 -; AVX1-NEXT: sarq $63, %r10 -; AVX1-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $51, %r11 -; AVX1-NEXT: sarq $63, %r11 -; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $50, %r14 -; AVX1-NEXT: sarq $63, %r14 -; AVX1-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $49, %r15 -; AVX1-NEXT: sarq $63, %r15 -; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX1-NEXT: shrl $15, %r9d -; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $63, %r13 -; AVX1-NEXT: sarq $63, %r13 -; AVX1-NEXT: vmovd %r13d, %xmm1 -; AVX1-NEXT: shlq $62, %r12 -; AVX1-NEXT: sarq $63, %r12 -; AVX1-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $61, %rbx -; AVX1-NEXT: sarq $63, %rbx -; AVX1-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $60, %rdi -; AVX1-NEXT: sarq $63, %rdi -; AVX1-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 -; AVX1-NEXT: shlq $59, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $58, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $57, %rsi -; AVX1-NEXT: sarq $63, %rsi -; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX1-NEXT: shrl $7, %ebp -; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: .cfi_def_cfa_offset 48 -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: .cfi_def_cfa_offset 40 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: .cfi_def_cfa_offset 32 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: .cfi_def_cfa_offset 24 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: .cfi_def_cfa_offset 16 -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_16i1_to_16i16: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: .cfi_def_cfa_offset 24 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: .cfi_def_cfa_offset 32 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: .cfi_def_cfa_offset 40 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: .cfi_def_cfa_offset 48 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: .cfi_def_cfa_offset 56 -; AVX2-NEXT: .cfi_offset %rbx, -56 -; AVX2-NEXT: .cfi_offset %r12, -48 -; AVX2-NEXT: .cfi_offset %r13, -40 -; AVX2-NEXT: .cfi_offset %r14, -32 -; AVX2-NEXT: .cfi_offset %r15, -24 -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movswq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $55, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq %rax, %r14 -; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: movl %eax, %r9d -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: movq %rax, %r13 -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: movsbq %al, %rbp -; AVX2-NEXT: shlq $54, %rax -; AVX2-NEXT: sarq $63, %rax -; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX2-NEXT: shlq $53, %r8 -; AVX2-NEXT: sarq $63, %r8 -; AVX2-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $52, %r10 -; AVX2-NEXT: sarq $63, %r10 -; AVX2-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $51, %r11 -; AVX2-NEXT: sarq $63, %r11 -; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $50, %r14 -; AVX2-NEXT: sarq $63, %r14 -; AVX2-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $49, %r15 -; AVX2-NEXT: sarq $63, %r15 -; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: shrl $15, %r9d -; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $63, %r13 -; AVX2-NEXT: sarq $63, %r13 -; AVX2-NEXT: vmovd %r13d, %xmm1 -; AVX2-NEXT: shlq $62, %r12 -; AVX2-NEXT: sarq $63, %r12 -; AVX2-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $61, %rbx -; AVX2-NEXT: sarq $63, %rbx -; AVX2-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $60, %rdi -; AVX2-NEXT: sarq $63, %rdi -; AVX2-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 -; AVX2-NEXT: shlq $59, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $58, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $57, %rsi -; AVX2-NEXT: sarq $63, %rsi -; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX2-NEXT: shrl $7, %ebp -; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: .cfi_def_cfa_offset 48 -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: .cfi_def_cfa_offset 40 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: .cfi_def_cfa_offset 32 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: .cfi_def_cfa_offset 24 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 8 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_16i1_to_16i16: @@ -3856,165 +2513,20 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; X32-SSE2-LABEL: load_sext_16i1_to_16i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzwl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $15, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $14, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $13, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $12, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $11, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $10, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $9, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $8, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $6, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $5, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $4, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $3, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $2, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: shrl %eax -; X32-SSE2-NEXT: andl $1, %eax -; X32-SSE2-NEXT: movd %eax, %xmm4 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psllw $15, %xmm0 -; X32-SSE2-NEXT: psraw $15, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: psllw $15, %xmm1 -; X32-SSE2-NEXT: psraw $15, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_16i1_to_16i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzwl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: andl $1, %edx -; X32-SSE41-NEXT: movd %edx, %xmm1 -; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $2, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $3, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $4, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $5, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $6, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $8, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $9, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $10, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $11, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $12, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $13, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $14, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; X32-SSE41-NEXT: shrl $15, %eax -; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; X32-SSE41-NEXT: psllw $15, %xmm0 -; X32-SSE41-NEXT: psraw $15, %xmm0 -; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; X32-SSE41-NEXT: psllw $15, %xmm1 -; X32-SSE41-NEXT: psraw $15, %xmm1 -; X32-SSE41-NEXT: retl +; X32-SSE-LABEL: load_sext_16i1_to_16i16: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; X32-SSE-NEXT: retl entry: %X = load <16 x i1>, <16 x i1>* %ptr %Y = sext <16 x i1> %X to <16 x i16> @@ -4022,762 +2534,49 @@ } define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { -; SSE2-LABEL: load_sext_32i1_to_32i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movswq (%rdi), %rax -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: movq %rax, %r15 -; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: shrq $15, %rbx -; SSE2-NEXT: movd %ebx, %xmm0 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: shlq $49, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm15 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movsbq %al, %rax -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: shlq $50, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movd %r8d, %xmm8 -; SSE2-NEXT: shlq $51, %r9 -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: shlq $52, %r11 -; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movd %r11d, %xmm9 -; SSE2-NEXT: shlq $53, %r14 -; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm6 -; SSE2-NEXT: shlq $54, %r15 -; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: movd %r15d, %xmm10 -; SSE2-NEXT: shlq $55, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm2 -; SSE2-NEXT: shlq $60, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movd %r13d, %xmm11 -; SSE2-NEXT: shlq $61, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: shlq $62, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movd %esi, %xmm12 -; SSE2-NEXT: shlq $63, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shlq $58, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: movd %ebp, %xmm13 -; SSE2-NEXT: shlq $59, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movd %ebx, %xmm7 -; SSE2-NEXT: shlq $57, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm4 -; SSE2-NEXT: shrq $7, %rax -; SSE2-NEXT: movd %eax, %xmm14 -; SSE2-NEXT: movswq 2(%rdi), %rsi -; SSE2-NEXT: movq %rsi, %r8 -; SSE2-NEXT: movq %rsi, %r9 -; SSE2-NEXT: movq %rsi, %r10 -; SSE2-NEXT: movq %rsi, %r11 -; SSE2-NEXT: movq %rsi, %r14 -; SSE2-NEXT: movq %rsi, %r15 -; SSE2-NEXT: movq %rsi, %r12 -; SSE2-NEXT: movq %rsi, %r13 -; SSE2-NEXT: movq %rsi, %rbx -; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: movq %rsi, %rcx -; SSE2-NEXT: movq %rsi, %rdx -; SSE2-NEXT: movq %rsi, %rdi -; SSE2-NEXT: movq %rsi, %rbp -; SSE2-NEXT: shrq $15, %rbp -; SSE2-NEXT: movd %ebp, %xmm1 -; SSE2-NEXT: movq %rsi, %rbp -; SSE2-NEXT: movsbq %sil, %rsi -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSE2-NEXT: shlq $49, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movd %r8d, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: shlq $50, %r9 -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: shlq $51, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: shlq $52, %r11 -; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movd %r11d, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: shlq $53, %r14 -; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: shlq $54, %r15 -; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: movd %r15d, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE2-NEXT: shlq $55, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: shlq $60, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movd %r13d, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: shlq $61, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movd %ebx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: shlq $62, %rax -; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: shlq $63, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: shlq $58, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: shlq $59, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: shlq $57, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: movd %ebp, %xmm2 -; SSE2-NEXT: shrq $7, %rsi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_32i1_to_32i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movswq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movq %rax, %r8 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: movq %rax, %r15 -; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: shrq $15, %rbx -; SSSE3-NEXT: movd %ebx, %xmm0 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: shlq $49, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm15 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movsbq %al, %rax -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSSE3-NEXT: shlq $50, %r8 -; SSSE3-NEXT: sarq $63, %r8 -; SSSE3-NEXT: movd %r8d, %xmm8 -; SSSE3-NEXT: shlq $51, %r9 -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: shlq $52, %r11 -; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movd %r11d, %xmm9 -; SSSE3-NEXT: shlq $53, %r14 -; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm6 -; SSSE3-NEXT: shlq $54, %r15 -; SSSE3-NEXT: sarq $63, %r15 -; SSSE3-NEXT: movd %r15d, %xmm10 -; SSSE3-NEXT: shlq $55, %r12 -; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm2 -; SSSE3-NEXT: shlq $60, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movd %r13d, %xmm11 -; SSSE3-NEXT: shlq $61, %rdx -; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: shlq $62, %rsi -; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movd %esi, %xmm12 -; SSSE3-NEXT: shlq $63, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: shlq $58, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: movd %ebp, %xmm13 -; SSSE3-NEXT: shlq $59, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movd %ebx, %xmm7 -; SSSE3-NEXT: shlq $57, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm4 -; SSSE3-NEXT: shrq $7, %rax -; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: movswq 2(%rdi), %rsi -; SSSE3-NEXT: movq %rsi, %r8 -; SSSE3-NEXT: movq %rsi, %r9 -; SSSE3-NEXT: movq %rsi, %r10 -; SSSE3-NEXT: movq %rsi, %r11 -; SSSE3-NEXT: movq %rsi, %r14 -; SSSE3-NEXT: movq %rsi, %r15 -; SSSE3-NEXT: movq %rsi, %r12 -; SSSE3-NEXT: movq %rsi, %r13 -; SSSE3-NEXT: movq %rsi, %rbx -; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: movq %rsi, %rcx -; SSSE3-NEXT: movq %rsi, %rdx -; SSSE3-NEXT: movq %rsi, %rdi -; SSSE3-NEXT: movq %rsi, %rbp -; SSSE3-NEXT: shrq $15, %rbp -; SSSE3-NEXT: movd %ebp, %xmm1 -; SSSE3-NEXT: movq %rsi, %rbp -; SSSE3-NEXT: movsbq %sil, %rsi -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSSE3-NEXT: shlq $49, %r8 -; SSSE3-NEXT: sarq $63, %r8 -; SSSE3-NEXT: movd %r8d, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSSE3-NEXT: shlq $50, %r9 -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSSE3-NEXT: shlq $51, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: shlq $52, %r11 -; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movd %r11d, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSSE3-NEXT: shlq $53, %r14 -; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSSE3-NEXT: shlq $54, %r15 -; SSSE3-NEXT: sarq $63, %r15 -; SSSE3-NEXT: movd %r15d, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSSE3-NEXT: shlq $55, %r12 -; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: shlq $60, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movd %r13d, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: shlq $61, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movd %ebx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSSE3-NEXT: shlq $62, %rax -; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSSE3-NEXT: shlq $63, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSSE3-NEXT: shlq $58, %rdx -; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: shlq $59, %rdi -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: movd %edi, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSSE3-NEXT: shlq $57, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: movd %ebp, %xmm2 -; SSSE3-NEXT: shrq $7, %rsi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_32i1_to_32i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movswq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 -; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $55, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $54, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $53, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $52, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $51, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $50, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $49, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: movswq 2(%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $55, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $54, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $53, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $52, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $51, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $50, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $49, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_32i1_to_32i8: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_32i1_to_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: movslq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $47, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: movq %rax, %r13 -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: movq %rax, %r10 -; AVX1-NEXT: movq %rax, %r11 -; AVX1-NEXT: movq %rax, %r9 -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: movq %rax, %r14 -; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: movq %rax, %r12 -; AVX1-NEXT: movq %rax, %rbp -; AVX1-NEXT: shlq $46, %rbp -; AVX1-NEXT: sarq $63, %rbp -; AVX1-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rbp -; AVX1-NEXT: shlq $45, %r8 -; AVX1-NEXT: sarq $63, %r8 -; AVX1-NEXT: vpinsrb $2, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: shlq $44, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $43, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $42, %rdi -; AVX1-NEXT: sarq $63, %rdi -; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: shlq $41, %r13 -; AVX1-NEXT: sarq $63, %r13 -; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r13 -; AVX1-NEXT: shlq $40, %rsi -; AVX1-NEXT: sarq $63, %rsi -; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: shlq $39, %r10 -; AVX1-NEXT: sarq $63, %r10 -; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r10 -; AVX1-NEXT: shlq $38, %r11 -; AVX1-NEXT: sarq $63, %r11 -; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 -; AVX1-NEXT: movsbq %al, %r11 -; AVX1-NEXT: shlq $37, %r9 -; AVX1-NEXT: sarq $63, %r9 -; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r9 -; AVX1-NEXT: shlq $36, %rbx -; AVX1-NEXT: sarq $63, %rbx -; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: shlq $35, %r14 -; AVX1-NEXT: sarq $63, %r14 -; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r14 -; AVX1-NEXT: shlq $34, %r15 -; AVX1-NEXT: sarq $63, %r15 -; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: shlq $33, %r12 -; AVX1-NEXT: sarq $63, %r12 -; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r12 -; AVX1-NEXT: shrq $31, %rbp -; AVX1-NEXT: vpinsrb $15, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rbp -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm1 -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: movswq %ax, %rax -; AVX1-NEXT: shlq $62, %r8 -; AVX1-NEXT: sarq $63, %r8 -; AVX1-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $60, %rdi -; AVX1-NEXT: sarq $63, %rdi -; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 -; AVX1-NEXT: shlq $59, %r13 -; AVX1-NEXT: sarq $63, %r13 -; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $58, %rsi -; AVX1-NEXT: sarq $63, %rsi -; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 -; AVX1-NEXT: shlq $57, %r10 -; AVX1-NEXT: sarq $63, %r10 -; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX1-NEXT: shrl $7, %r11d -; AVX1-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $55, %r9 -; AVX1-NEXT: sarq $63, %r9 -; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $54, %rbx -; AVX1-NEXT: sarq $63, %rbx -; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $53, %r14 -; AVX1-NEXT: sarq $63, %r14 -; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $52, %r15 -; AVX1-NEXT: sarq $63, %r15 -; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $51, %r12 -; AVX1-NEXT: sarq $63, %r12 -; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $50, %rbp -; AVX1-NEXT: sarq $63, %rbp -; AVX1-NEXT: vpinsrb $13, %ebp, %xmm1, %xmm1 -; AVX1-NEXT: shlq $49, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX1-NEXT: shrl $15, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_32i1_to_32i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movslq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $47, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: movq %rax, %r13 -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: movq %rax, %r14 -; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: shlq $46, %rbp -; AVX2-NEXT: sarq $63, %rbp -; AVX2-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: shlq $45, %r8 -; AVX2-NEXT: sarq $63, %r8 -; AVX2-NEXT: vpinsrb $2, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: shlq $44, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $43, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $42, %rdi -; AVX2-NEXT: sarq $63, %rdi -; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: shlq $41, %r13 -; AVX2-NEXT: sarq $63, %r13 -; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r13 -; AVX2-NEXT: shlq $40, %rsi -; AVX2-NEXT: sarq $63, %rsi -; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shlq $39, %r10 -; AVX2-NEXT: sarq $63, %r10 -; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: shlq $38, %r11 -; AVX2-NEXT: sarq $63, %r11 -; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 -; AVX2-NEXT: movsbq %al, %r11 -; AVX2-NEXT: shlq $37, %r9 -; AVX2-NEXT: sarq $63, %r9 -; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: shlq $36, %rbx -; AVX2-NEXT: sarq $63, %rbx -; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: shlq $35, %r14 -; AVX2-NEXT: sarq $63, %r14 -; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r14 -; AVX2-NEXT: shlq $34, %r15 -; AVX2-NEXT: sarq $63, %r15 -; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: shlq $33, %r12 -; AVX2-NEXT: sarq $63, %r12 -; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: shrq $31, %rbp -; AVX2-NEXT: vpinsrb $15, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm1 -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: movswq %ax, %rax -; AVX2-NEXT: shlq $62, %r8 -; AVX2-NEXT: sarq $63, %r8 -; AVX2-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $60, %rdi -; AVX2-NEXT: sarq $63, %rdi -; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 -; AVX2-NEXT: shlq $59, %r13 -; AVX2-NEXT: sarq $63, %r13 -; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $58, %rsi -; AVX2-NEXT: sarq $63, %rsi -; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 -; AVX2-NEXT: shlq $57, %r10 -; AVX2-NEXT: sarq $63, %r10 -; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX2-NEXT: shrl $7, %r11d -; AVX2-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $55, %r9 -; AVX2-NEXT: sarq $63, %r9 -; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $54, %rbx -; AVX2-NEXT: sarq $63, %rbx -; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $53, %r14 -; AVX2-NEXT: sarq $63, %r14 -; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $52, %r15 -; AVX2-NEXT: sarq $63, %r15 -; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $51, %r12 -; AVX2-NEXT: sarq $63, %r12 -; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $50, %rbp -; AVX2-NEXT: sarq $63, %rbp -; AVX2-NEXT: vpinsrb $13, %ebp, %xmm1, %xmm1 -; AVX2-NEXT: shlq $49, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX2-NEXT: shrl $15, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_32i1_to_32i8: @@ -4798,309 +2597,21 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; X32-SSE2-LABEL: load_sext_32i1_to_32i8: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: pushl %ebp -; X32-SSE2-NEXT: pushl %ebx -; X32-SSE2-NEXT: pushl %edi -; X32-SSE2-NEXT: pushl %esi -; X32-SSE2-NEXT: subl $28, %esp -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movswl (%eax), %edx -; X32-SSE2-NEXT: movl %edx, %ebp -; X32-SSE2-NEXT: movl %edx, %esi -; X32-SSE2-NEXT: movl %edx, %edi -; X32-SSE2-NEXT: movl %edx, %ebx -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shrl $15, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shll $17, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm4 -; X32-SSE2-NEXT: movl %edx, %ebp -; X32-SSE2-NEXT: shll $18, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm1 -; X32-SSE2-NEXT: movl %edx, %esi -; X32-SSE2-NEXT: shll $19, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm2 -; X32-SSE2-NEXT: movl %edx, %edi -; X32-SSE2-NEXT: shll $20, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm5 -; X32-SSE2-NEXT: movl %edx, %ebx -; X32-SSE2-NEXT: shll $21, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm6 -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shll $22, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm7 -; X32-SSE2-NEXT: movl %edx, %ebp -; X32-SSE2-NEXT: shll $23, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm3 -; X32-SSE2-NEXT: movl %edx, %esi -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; X32-SSE2-NEXT: shll $28, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm0 -; X32-SSE2-NEXT: movl %edx, %edi -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X32-SSE2-NEXT: shll $29, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm1 -; X32-SSE2-NEXT: movl %edx, %ebx -; X32-SSE2-NEXT: movsbl %dl, %edx -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: shll $30, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: shll $31, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X32-SSE2-NEXT: shll $26, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm7 -; X32-SSE2-NEXT: shll $27, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm2 -; X32-SSE2-NEXT: shll $25, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm6 -; X32-SSE2-NEXT: shrl $7, %edx -; X32-SSE2-NEXT: movd %edx, %xmm5 -; X32-SSE2-NEXT: movswl 2(%eax), %eax -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shrl $15, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm4 -; X32-SSE2-NEXT: movdqu %xmm4, (%esp) # 16-byte Spill -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: shll $17, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm4 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; X32-SSE2-NEXT: shll $18, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm7 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X32-SSE2-NEXT: shll $19, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm5 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; X32-SSE2-NEXT: shll $20, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm6 -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: shll $21, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm1 -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; X32-SSE2-NEXT: shll $22, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm3 -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: movdqu (%esp), %xmm2 # 16-byte Reload -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X32-SSE2-NEXT: shll $23, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm2 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; X32-SSE2-NEXT: shll $28, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm7 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movsbl %al, %eax -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; X32-SSE2-NEXT: shll $29, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X32-SSE2-NEXT: shll $30, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; X32-SSE2-NEXT: shll $31, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; X32-SSE2-NEXT: shll $26, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm5 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE2-NEXT: shll $27, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; X32-SSE2-NEXT: shll $25, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm4 -; X32-SSE2-NEXT: shrl $7, %eax -; X32-SSE2-NEXT: movd %eax, %xmm5 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE2-NEXT: addl $28, %esp -; X32-SSE2-NEXT: popl %esi -; X32-SSE2-NEXT: popl %edi -; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_32i1_to_32i8: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pushl %esi -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movswl (%eax), %ecx -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $30, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movl %ecx, %esi -; X32-SSE41-NEXT: shll $31, %esi -; X32-SSE41-NEXT: sarl $31, %esi -; X32-SSE41-NEXT: movd %esi, %xmm0 -; X32-SSE41-NEXT: pinsrb $1, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $29, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $2, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $28, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $3, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $27, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $4, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $26, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $5, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $25, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $6, %edx, %xmm0 -; X32-SSE41-NEXT: movsbl %cl, %edx -; X32-SSE41-NEXT: shrl $7, %edx -; X32-SSE41-NEXT: pinsrb $7, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $23, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $8, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $22, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $9, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $21, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $10, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $20, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $11, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $19, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $12, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $18, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $13, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $17, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $14, %edx, %xmm0 -; X32-SSE41-NEXT: shrl $15, %ecx -; X32-SSE41-NEXT: pinsrb $15, %ecx, %xmm0 -; X32-SSE41-NEXT: movswl 2(%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $30, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: shll $31, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movd %edx, %xmm1 -; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $29, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $28, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $27, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $26, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $25, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; X32-SSE41-NEXT: movsbl %al, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $23, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $22, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $21, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $20, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $19, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $18, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $17, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; X32-SSE41-NEXT: shrl $15, %eax -; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; X32-SSE41-NEXT: popl %esi -; X32-SSE41-NEXT: retl +; X32-SSE-LABEL: load_sext_32i1_to_32i8: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; X32-SSE-NEXT: retl entry: %X = load <32 x i1>, <32 x i1>* %ptr %Y = sext <32 x i1> %X to <32 x i8> Index: llvm/trunk/test/CodeGen/X86/vector-sext.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-sext.ll +++ llvm/trunk/test/CodeGen/X86/vector-sext.ll @@ -2026,193 +2026,32 @@ } define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { -; SSE2-LABEL: load_sext_8i1_to_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsbq (%rdi), %rax -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq $7, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $57, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $58, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $59, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $60, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $61, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $62, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: shlq $63, %rax -; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i1_to_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsbq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shrq $7, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $57, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $58, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $59, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $60, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $61, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $62, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: shlq $63, %rax -; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i1_to_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movsbq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $3, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $5, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE41-NEXT: shrl $7, %eax -; SSE41-NEXT: pinsrw $7, %eax, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_8i1_to_8i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_8i1_to_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movsbq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $60, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $59, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $58, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $57, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrl $7, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_8i1_to_8i16: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movsbq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $60, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $59, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $58, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $57, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrl $7, %eax -; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_8i1_to_8i16: @@ -2232,84 +2071,16 @@ ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; -; X32-SSE2-LABEL: load_sext_8i1_to_8i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movsbl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $25, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $26, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $27, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $28, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $29, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $30, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: shll $31, %eax -; X32-SSE2-NEXT: sarl $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_8i1_to_8i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movsbl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $30, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: shll $31, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movd %edx, %xmm0 -; X32-SSE41-NEXT: pinsrw $1, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $29, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $2, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $28, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $3, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $27, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $4, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $26, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $5, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $25, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrw $6, %ecx, %xmm0 -; X32-SSE41-NEXT: shrl $7, %eax -; X32-SSE41-NEXT: pinsrw $7, %eax, %xmm0 -; X32-SSE41-NEXT: retl +; X32-SSE-LABEL: load_sext_8i1_to_8i16: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; X32-SSE-NEXT: retl entry: %X = load <8 x i1>, <8 x i1>* %ptr %Y = sext <8 x i1> %X to <8 x i16> @@ -2466,322 +2237,81 @@ } define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { -; SSE2-LABEL: load_sext_8i1_to_8i32: +; SSE-LABEL: load_sext_8i1_to_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_8i1_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i1_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i1_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: kmovw (%rdi), %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq +; +; X32-SSE-LABEL: load_sext_8i1_to_8i32: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; X32-SSE-NEXT: retl +entry: + %X = load <8 x i1>, <8 x i1>* %ptr + %Y = sext <8 x i1> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i32: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzbl (%rdi), %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $7, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $5, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: retq ; -; SSSE3-LABEL: load_sext_8i1_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzbl (%rdi), %eax -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $7, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $6, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $5, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: shrl %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i1_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movzbl (%rdi), %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: andl $1, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $2, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $3, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $3, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $4, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $5, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $5, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $6, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrw $6, %ecx, %xmm1 -; SSE41-NEXT: shrl $7, %eax -; SSE41-NEXT: pinsrw $7, %eax, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: pslld $31, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_8i1_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movsbq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $58, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $59, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $57, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $7, %rcx -; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm1 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $60, %rax -; AVX1-NEXT: sarq $63, %rax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_8i1_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movsbq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $58, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $59, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $57, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $7, %rcx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm1 -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $60, %rax -; AVX2-NEXT: sarq $63, %rax -; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_8i1_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_8i1_to_8i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzbl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $6, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $5, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $4, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $3, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $2, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: shrl %eax -; X32-SSE2-NEXT: andl $1, %eax -; X32-SSE2-NEXT: movd %eax, %xmm3 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: pslld $31, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: pslld $31, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_8i1_to_8i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzbl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: andl $1, %edx -; X32-SSE41-NEXT: movd %edx, %xmm1 -; X32-SSE41-NEXT: pinsrw $1, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $2, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $2, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $3, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $3, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $4, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $4, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $5, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $5, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $6, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrw $6, %ecx, %xmm1 -; X32-SSE41-NEXT: shrl $7, %eax -; X32-SSE41-NEXT: pinsrw $7, %eax, %xmm1 -; X32-SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X32-SSE41-NEXT: pslld $31, %xmm0 -; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE41-NEXT: pslld $31, %xmm1 -; X32-SSE41-NEXT: psrad $31, %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <8 x i1>, <8 x i1>* %ptr - %Y = sext <8 x i1> %X to <8 x i32> - ret <8 x i32> %Y -} - -define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { -; SSE2-LABEL: load_sext_8i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i8_to_8i32: +; SSSE3-LABEL: load_sext_8i8_to_8i32: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2840,388 +2370,50 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; SSE2-LABEL: load_sext_16i1_to_16i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movswq (%rdi), %rax -; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: movq %rax, %r15 -; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: shrq $15, %rbp -; SSE2-NEXT: movd %ebp, %xmm0 -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: movsbq %al, %rax -; SSE2-NEXT: shlq $49, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: shlq $50, %r9 -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm2 -; SSE2-NEXT: shlq $51, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm3 -; SSE2-NEXT: shlq $52, %r11 -; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movd %r11d, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: shlq $53, %r14 -; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: shlq $54, %r15 -; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: movd %r15d, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: shlq $55, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: shlq $60, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movd %r13d, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: shlq $61, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movd %ebx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: shlq $62, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: shlq $63, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: shlq $58, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: shlq $59, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: shlq $57, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: movd %ebp, %xmm2 -; SSE2-NEXT: shrq $7, %rax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_16i1_to_16i8: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movswq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %r8 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: movq %rax, %r15 -; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: shrq $15, %rbp -; SSSE3-NEXT: movd %ebp, %xmm0 -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: movsbq %al, %rax -; SSSE3-NEXT: shlq $49, %r8 -; SSSE3-NEXT: sarq $63, %r8 -; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: shlq $50, %r9 -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm2 -; SSSE3-NEXT: shlq $51, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm3 -; SSSE3-NEXT: shlq $52, %r11 -; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movd %r11d, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: shlq $53, %r14 -; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: shlq $54, %r15 -; SSSE3-NEXT: sarq $63, %r15 -; SSSE3-NEXT: movd %r15d, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSSE3-NEXT: shlq $55, %r12 -; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: shlq $60, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movd %r13d, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: shlq $61, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movd %ebx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: shlq $62, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: shlq $63, %rdx -; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSSE3-NEXT: shlq $58, %rsi -; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: shlq $59, %rdi -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: movd %edi, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: shlq $57, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: movd %ebp, %xmm2 -; SSSE3-NEXT: shrq $7, %rax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_16i1_to_16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movswq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 -; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $55, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $54, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $53, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $52, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $51, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $50, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $49, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: load_sext_16i1_to_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movswq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $60, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $59, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $58, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $57, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movsbq %al, %rcx -; AVX1-NEXT: shrl $7, %ecx -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $55, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $54, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $53, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $52, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $51, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $50, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $49, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrl $15, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_16i1_to_16i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movswq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $60, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $59, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $58, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $57, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movsbq %al, %rcx -; AVX2-NEXT: shrl $7, %ecx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $55, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $54, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $53, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $52, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $51, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $50, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $49, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrl $15, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_16i1_to_16i8: @@ -3242,160 +2434,24 @@ ; ; X32-SSE2-LABEL: load_sext_16i1_to_16i8: ; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: pushl %ebp -; X32-SSE2-NEXT: pushl %ebx -; X32-SSE2-NEXT: pushl %edi -; X32-SSE2-NEXT: pushl %esi ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movswl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shrl $15, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm2 -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shll $17, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: shll $18, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm1 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: shll $19, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $20, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm4 -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: shll $21, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm6 -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: shll $22, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm7 -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shll $23, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm5 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X32-SSE2-NEXT: shll $28, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm2 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movsbl %al, %eax -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; X32-SSE2-NEXT: shll $29, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; X32-SSE2-NEXT: shll $30, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X32-SSE2-NEXT: shll $31, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X32-SSE2-NEXT: shll $26, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE2-NEXT: shll $27, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm3 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X32-SSE2-NEXT: shll $25, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm1 -; X32-SSE2-NEXT: shrl $7, %eax -; X32-SSE2-NEXT: movd %eax, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; X32-SSE2-NEXT: popl %esi -; X32-SSE2-NEXT: popl %edi -; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movswl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $30, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: shll $31, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movd %edx, %xmm0 -; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $29, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $28, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $27, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $26, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $25, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm0 -; X32-SSE41-NEXT: movsbl %al, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $23, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $22, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $21, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $20, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $19, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $18, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $17, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; X32-SSE41-NEXT: shrl $15, %eax -; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; X32-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE41-NEXT: pand %xmm1, %xmm0 +; X32-SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; X32-SSE41-NEXT: retl entry: %X = load <16 x i1>, <16 x i1>* %ptr @@ -3404,442 +2460,43 @@ } define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { -; SSE2-LABEL: load_sext_16i1_to_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $14, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $13, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $12, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $11, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $10, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $9, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $7, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $5, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psllw $15, %xmm0 -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psllw $15, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_16i1_to_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzwl (%rdi), %eax -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $15, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $14, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $13, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $12, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $11, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $10, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $9, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $8, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $7, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $6, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $5, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: shrl %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psllw $15, %xmm0 -; SSSE3-NEXT: psraw $15, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: psllw $15, %xmm1 -; SSSE3-NEXT: psraw $15, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_16i1_to_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movzwl (%rdi), %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: andl $1, %edx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $2, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $3, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $4, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $5, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $6, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $8, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $9, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $10, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $11, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $12, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $13, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $14, %ecx -; SSE41-NEXT: andl $1, %ecx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: psllw $15, %xmm0 -; SSE41-NEXT: psraw $15, %xmm0 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: psllw $15, %xmm1 -; SSE41-NEXT: psraw $15, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_16i1_to_16i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_16i1_to_16i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: .cfi_def_cfa_offset 16 -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: .cfi_def_cfa_offset 24 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: .cfi_def_cfa_offset 32 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: .cfi_def_cfa_offset 40 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: .cfi_def_cfa_offset 48 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: .cfi_def_cfa_offset 56 -; AVX1-NEXT: .cfi_offset %rbx, -56 -; AVX1-NEXT: .cfi_offset %r12, -48 -; AVX1-NEXT: .cfi_offset %r13, -40 -; AVX1-NEXT: .cfi_offset %r14, -32 -; AVX1-NEXT: .cfi_offset %r15, -24 -; AVX1-NEXT: .cfi_offset %rbp, -16 -; AVX1-NEXT: movswq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $55, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: movq %rax, %r10 -; AVX1-NEXT: movq %rax, %r11 -; AVX1-NEXT: movq %rax, %r14 -; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: movl %eax, %r9d -; AVX1-NEXT: movq %rax, %r12 -; AVX1-NEXT: movq %rax, %r13 -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: movsbq %al, %rbp -; AVX1-NEXT: shlq $54, %rax -; AVX1-NEXT: sarq $63, %rax -; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: shlq $53, %r8 -; AVX1-NEXT: sarq $63, %r8 -; AVX1-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $52, %r10 -; AVX1-NEXT: sarq $63, %r10 -; AVX1-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $51, %r11 -; AVX1-NEXT: sarq $63, %r11 -; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $50, %r14 -; AVX1-NEXT: sarq $63, %r14 -; AVX1-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $49, %r15 -; AVX1-NEXT: sarq $63, %r15 -; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX1-NEXT: shrl $15, %r9d -; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: shlq $63, %r13 -; AVX1-NEXT: sarq $63, %r13 -; AVX1-NEXT: vmovd %r13d, %xmm1 -; AVX1-NEXT: shlq $62, %r12 -; AVX1-NEXT: sarq $63, %r12 -; AVX1-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $61, %rbx -; AVX1-NEXT: sarq $63, %rbx -; AVX1-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $60, %rdi -; AVX1-NEXT: sarq $63, %rdi -; AVX1-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 -; AVX1-NEXT: shlq $59, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $58, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $57, %rsi -; AVX1-NEXT: sarq $63, %rsi -; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX1-NEXT: shrl $7, %ebp -; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: .cfi_def_cfa_offset 48 -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: .cfi_def_cfa_offset 40 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: .cfi_def_cfa_offset 32 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: .cfi_def_cfa_offset 24 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: .cfi_def_cfa_offset 16 -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_16i1_to_16i16: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: .cfi_def_cfa_offset 24 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: .cfi_def_cfa_offset 32 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: .cfi_def_cfa_offset 40 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: .cfi_def_cfa_offset 48 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: .cfi_def_cfa_offset 56 -; AVX2-NEXT: .cfi_offset %rbx, -56 -; AVX2-NEXT: .cfi_offset %r12, -48 -; AVX2-NEXT: .cfi_offset %r13, -40 -; AVX2-NEXT: .cfi_offset %r14, -32 -; AVX2-NEXT: .cfi_offset %r15, -24 -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movswq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $55, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq %rax, %r14 -; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: movl %eax, %r9d -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: movq %rax, %r13 -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: movsbq %al, %rbp -; AVX2-NEXT: shlq $54, %rax -; AVX2-NEXT: sarq $63, %rax -; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX2-NEXT: shlq $53, %r8 -; AVX2-NEXT: sarq $63, %r8 -; AVX2-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $52, %r10 -; AVX2-NEXT: sarq $63, %r10 -; AVX2-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $51, %r11 -; AVX2-NEXT: sarq $63, %r11 -; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $50, %r14 -; AVX2-NEXT: sarq $63, %r14 -; AVX2-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $49, %r15 -; AVX2-NEXT: sarq $63, %r15 -; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: shrl $15, %r9d -; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: shlq $63, %r13 -; AVX2-NEXT: sarq $63, %r13 -; AVX2-NEXT: vmovd %r13d, %xmm1 -; AVX2-NEXT: shlq $62, %r12 -; AVX2-NEXT: sarq $63, %r12 -; AVX2-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $61, %rbx -; AVX2-NEXT: sarq $63, %rbx -; AVX2-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $60, %rdi -; AVX2-NEXT: sarq $63, %rdi -; AVX2-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 -; AVX2-NEXT: shlq $59, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $58, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $57, %rsi -; AVX2-NEXT: sarq $63, %rsi -; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX2-NEXT: shrl $7, %ebp -; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: .cfi_def_cfa_offset 48 -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: .cfi_def_cfa_offset 40 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: .cfi_def_cfa_offset 32 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: .cfi_def_cfa_offset 24 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 8 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_16i1_to_16i16: @@ -3856,165 +2513,20 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; X32-SSE2-LABEL: load_sext_16i1_to_16i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzwl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $15, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $14, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $13, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $12, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $11, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $10, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $9, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $8, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $7, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $6, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $5, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $4, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $3, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $2, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: andl $1, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: shrl %eax -; X32-SSE2-NEXT: andl $1, %eax -; X32-SSE2-NEXT: movd %eax, %xmm4 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psllw $15, %xmm0 -; X32-SSE2-NEXT: psraw $15, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: psllw $15, %xmm1 -; X32-SSE2-NEXT: psraw $15, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_16i1_to_16i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzwl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: andl $1, %edx -; X32-SSE41-NEXT: movd %edx, %xmm1 -; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $2, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $3, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $4, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $5, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $6, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $8, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $9, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $10, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $11, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $12, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $13, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $14, %ecx -; X32-SSE41-NEXT: andl $1, %ecx -; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; X32-SSE41-NEXT: shrl $15, %eax -; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; X32-SSE41-NEXT: psllw $15, %xmm0 -; X32-SSE41-NEXT: psraw $15, %xmm0 -; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; X32-SSE41-NEXT: psllw $15, %xmm1 -; X32-SSE41-NEXT: psraw $15, %xmm1 -; X32-SSE41-NEXT: retl +; X32-SSE-LABEL: load_sext_16i1_to_16i16: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; X32-SSE-NEXT: retl entry: %X = load <16 x i1>, <16 x i1>* %ptr %Y = sext <16 x i1> %X to <16 x i16> @@ -4022,762 +2534,49 @@ } define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { -; SSE2-LABEL: load_sext_32i1_to_32i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r15 -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 -; SSE2-NEXT: pushq %r12 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movswq (%rdi), %rax -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: movq %rax, %r15 -; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: shrq $15, %rbx -; SSE2-NEXT: movd %ebx, %xmm0 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: shlq $49, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm15 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movsbq %al, %rax -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: shlq $50, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movd %r8d, %xmm8 -; SSE2-NEXT: shlq $51, %r9 -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: shlq $52, %r11 -; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movd %r11d, %xmm9 -; SSE2-NEXT: shlq $53, %r14 -; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm6 -; SSE2-NEXT: shlq $54, %r15 -; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: movd %r15d, %xmm10 -; SSE2-NEXT: shlq $55, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm2 -; SSE2-NEXT: shlq $60, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movd %r13d, %xmm11 -; SSE2-NEXT: shlq $61, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: shlq $62, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movd %esi, %xmm12 -; SSE2-NEXT: shlq $63, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: shlq $58, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: movd %ebp, %xmm13 -; SSE2-NEXT: shlq $59, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movd %ebx, %xmm7 -; SSE2-NEXT: shlq $57, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm4 -; SSE2-NEXT: shrq $7, %rax -; SSE2-NEXT: movd %eax, %xmm14 -; SSE2-NEXT: movswq 2(%rdi), %rsi -; SSE2-NEXT: movq %rsi, %r8 -; SSE2-NEXT: movq %rsi, %r9 -; SSE2-NEXT: movq %rsi, %r10 -; SSE2-NEXT: movq %rsi, %r11 -; SSE2-NEXT: movq %rsi, %r14 -; SSE2-NEXT: movq %rsi, %r15 -; SSE2-NEXT: movq %rsi, %r12 -; SSE2-NEXT: movq %rsi, %r13 -; SSE2-NEXT: movq %rsi, %rbx -; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: movq %rsi, %rcx -; SSE2-NEXT: movq %rsi, %rdx -; SSE2-NEXT: movq %rsi, %rdi -; SSE2-NEXT: movq %rsi, %rbp -; SSE2-NEXT: shrq $15, %rbp -; SSE2-NEXT: movd %ebp, %xmm1 -; SSE2-NEXT: movq %rsi, %rbp -; SSE2-NEXT: movsbq %sil, %rsi -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSE2-NEXT: shlq $49, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movd %r8d, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: shlq $50, %r9 -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: shlq $51, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: shlq $52, %r11 -; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movd %r11d, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: shlq $53, %r14 -; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: shlq $54, %r15 -; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: movd %r15d, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE2-NEXT: shlq $55, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: shlq $60, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movd %r13d, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: shlq $61, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movd %ebx, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: shlq $62, %rax -; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: shlq $63, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: shlq $58, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: shlq $59, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: shlq $57, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: movd %ebp, %xmm2 -; SSE2-NEXT: shrq $7, %rsi -; SSE2-NEXT: movd %esi, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %r15 -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_32i1_to_32i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: pushq %r15 -; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 -; SSSE3-NEXT: pushq %r12 -; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movswq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movq %rax, %r8 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: movq %rax, %r15 -; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: shrq $15, %rbx -; SSSE3-NEXT: movd %ebx, %xmm0 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: shlq $49, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm15 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movsbq %al, %rax -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSSE3-NEXT: shlq $50, %r8 -; SSSE3-NEXT: sarq $63, %r8 -; SSSE3-NEXT: movd %r8d, %xmm8 -; SSSE3-NEXT: shlq $51, %r9 -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: shlq $52, %r11 -; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movd %r11d, %xmm9 -; SSSE3-NEXT: shlq $53, %r14 -; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm6 -; SSSE3-NEXT: shlq $54, %r15 -; SSSE3-NEXT: sarq $63, %r15 -; SSSE3-NEXT: movd %r15d, %xmm10 -; SSSE3-NEXT: shlq $55, %r12 -; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm2 -; SSSE3-NEXT: shlq $60, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movd %r13d, %xmm11 -; SSSE3-NEXT: shlq $61, %rdx -; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: shlq $62, %rsi -; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movd %esi, %xmm12 -; SSSE3-NEXT: shlq $63, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: shlq $58, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: movd %ebp, %xmm13 -; SSSE3-NEXT: shlq $59, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movd %ebx, %xmm7 -; SSSE3-NEXT: shlq $57, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm4 -; SSSE3-NEXT: shrq $7, %rax -; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: movswq 2(%rdi), %rsi -; SSSE3-NEXT: movq %rsi, %r8 -; SSSE3-NEXT: movq %rsi, %r9 -; SSSE3-NEXT: movq %rsi, %r10 -; SSSE3-NEXT: movq %rsi, %r11 -; SSSE3-NEXT: movq %rsi, %r14 -; SSSE3-NEXT: movq %rsi, %r15 -; SSSE3-NEXT: movq %rsi, %r12 -; SSSE3-NEXT: movq %rsi, %r13 -; SSSE3-NEXT: movq %rsi, %rbx -; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: movq %rsi, %rcx -; SSSE3-NEXT: movq %rsi, %rdx -; SSSE3-NEXT: movq %rsi, %rdi -; SSSE3-NEXT: movq %rsi, %rbp -; SSSE3-NEXT: shrq $15, %rbp -; SSSE3-NEXT: movd %ebp, %xmm1 -; SSSE3-NEXT: movq %rsi, %rbp -; SSSE3-NEXT: movsbq %sil, %rsi -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSSE3-NEXT: shlq $49, %r8 -; SSSE3-NEXT: sarq $63, %r8 -; SSSE3-NEXT: movd %r8d, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSSE3-NEXT: shlq $50, %r9 -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSSE3-NEXT: shlq $51, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: shlq $52, %r11 -; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movd %r11d, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSSE3-NEXT: shlq $53, %r14 -; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSSE3-NEXT: shlq $54, %r15 -; SSSE3-NEXT: sarq $63, %r15 -; SSSE3-NEXT: movd %r15d, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSSE3-NEXT: shlq $55, %r12 -; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: shlq $60, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movd %r13d, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: shlq $61, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movd %ebx, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSSE3-NEXT: shlq $62, %rax -; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSSE3-NEXT: shlq $63, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSSE3-NEXT: shlq $58, %rdx -; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: shlq $59, %rdi -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: movd %edi, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSSE3-NEXT: shlq $57, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: movd %ebp, %xmm2 -; SSSE3-NEXT: shrq $7, %rsi -; SSSE3-NEXT: movd %esi, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSSE3-NEXT: popq %rbx -; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 -; SSSE3-NEXT: popq %r14 -; SSSE3-NEXT: popq %r15 -; SSSE3-NEXT: popq %rbp -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_32i1_to_32i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movswq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 -; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $55, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $54, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $53, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $52, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $51, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $50, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $49, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: movswq 2(%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm1 -; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $60, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $59, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $58, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $57, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrl $7, %ecx -; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $55, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $54, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $53, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $52, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $51, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $50, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $49, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: shrl $15, %eax -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_32i1_to_32i8: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_32i1_to_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: movslq (%rdi), %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $47, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: movq %rax, %r13 -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: movq %rax, %r10 -; AVX1-NEXT: movq %rax, %r11 -; AVX1-NEXT: movq %rax, %r9 -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: movq %rax, %r14 -; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: movq %rax, %r12 -; AVX1-NEXT: movq %rax, %rbp -; AVX1-NEXT: shlq $46, %rbp -; AVX1-NEXT: sarq $63, %rbp -; AVX1-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rbp -; AVX1-NEXT: shlq $45, %r8 -; AVX1-NEXT: sarq $63, %r8 -; AVX1-NEXT: vpinsrb $2, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: shlq $44, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $43, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $42, %rdi -; AVX1-NEXT: sarq $63, %rdi -; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: shlq $41, %r13 -; AVX1-NEXT: sarq $63, %r13 -; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r13 -; AVX1-NEXT: shlq $40, %rsi -; AVX1-NEXT: sarq $63, %rsi -; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: shlq $39, %r10 -; AVX1-NEXT: sarq $63, %r10 -; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r10 -; AVX1-NEXT: shlq $38, %r11 -; AVX1-NEXT: sarq $63, %r11 -; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 -; AVX1-NEXT: movsbq %al, %r11 -; AVX1-NEXT: shlq $37, %r9 -; AVX1-NEXT: sarq $63, %r9 -; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r9 -; AVX1-NEXT: shlq $36, %rbx -; AVX1-NEXT: sarq $63, %rbx -; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: shlq $35, %r14 -; AVX1-NEXT: sarq $63, %r14 -; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r14 -; AVX1-NEXT: shlq $34, %r15 -; AVX1-NEXT: sarq $63, %r15 -; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: shlq $33, %r12 -; AVX1-NEXT: sarq $63, %r12 -; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %r12 -; AVX1-NEXT: shrq $31, %rbp -; AVX1-NEXT: vpinsrb $15, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rbp -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm1 -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: movswq %ax, %rax -; AVX1-NEXT: shlq $62, %r8 -; AVX1-NEXT: sarq $63, %r8 -; AVX1-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $60, %rdi -; AVX1-NEXT: sarq $63, %rdi -; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 -; AVX1-NEXT: shlq $59, %r13 -; AVX1-NEXT: sarq $63, %r13 -; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $58, %rsi -; AVX1-NEXT: sarq $63, %rsi -; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 -; AVX1-NEXT: shlq $57, %r10 -; AVX1-NEXT: sarq $63, %r10 -; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX1-NEXT: shrl $7, %r11d -; AVX1-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $55, %r9 -; AVX1-NEXT: sarq $63, %r9 -; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $54, %rbx -; AVX1-NEXT: sarq $63, %rbx -; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 -; AVX1-NEXT: shlq $53, %r14 -; AVX1-NEXT: sarq $63, %r14 -; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $52, %r15 -; AVX1-NEXT: sarq $63, %r15 -; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $51, %r12 -; AVX1-NEXT: sarq $63, %r12 -; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 -; AVX1-NEXT: shlq $50, %rbp -; AVX1-NEXT: sarq $63, %rbp -; AVX1-NEXT: vpinsrb $13, %ebp, %xmm1, %xmm1 -; AVX1-NEXT: shlq $49, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX1-NEXT: shrl $15, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_32i1_to_32i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movslq (%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $47, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: movq %rax, %r13 -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: movq %rax, %r14 -; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: shlq $46, %rbp -; AVX2-NEXT: sarq $63, %rbp -; AVX2-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: shlq $45, %r8 -; AVX2-NEXT: sarq $63, %r8 -; AVX2-NEXT: vpinsrb $2, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: shlq $44, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $43, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $42, %rdi -; AVX2-NEXT: sarq $63, %rdi -; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: shlq $41, %r13 -; AVX2-NEXT: sarq $63, %r13 -; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r13 -; AVX2-NEXT: shlq $40, %rsi -; AVX2-NEXT: sarq $63, %rsi -; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shlq $39, %r10 -; AVX2-NEXT: sarq $63, %r10 -; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: shlq $38, %r11 -; AVX2-NEXT: sarq $63, %r11 -; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 -; AVX2-NEXT: movsbq %al, %r11 -; AVX2-NEXT: shlq $37, %r9 -; AVX2-NEXT: sarq $63, %r9 -; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: shlq $36, %rbx -; AVX2-NEXT: sarq $63, %rbx -; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: shlq $35, %r14 -; AVX2-NEXT: sarq $63, %r14 -; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r14 -; AVX2-NEXT: shlq $34, %r15 -; AVX2-NEXT: sarq $63, %r15 -; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: shlq $33, %r12 -; AVX2-NEXT: sarq $63, %r12 -; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: shrq $31, %rbp -; AVX2-NEXT: vpinsrb $15, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm1 -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: movswq %ax, %rax -; AVX2-NEXT: shlq $62, %r8 -; AVX2-NEXT: sarq $63, %r8 -; AVX2-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $60, %rdi -; AVX2-NEXT: sarq $63, %rdi -; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 -; AVX2-NEXT: shlq $59, %r13 -; AVX2-NEXT: sarq $63, %r13 -; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $58, %rsi -; AVX2-NEXT: sarq $63, %rsi -; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 -; AVX2-NEXT: shlq $57, %r10 -; AVX2-NEXT: sarq $63, %r10 -; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX2-NEXT: shrl $7, %r11d -; AVX2-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $55, %r9 -; AVX2-NEXT: sarq $63, %r9 -; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $54, %rbx -; AVX2-NEXT: sarq $63, %rbx -; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 -; AVX2-NEXT: shlq $53, %r14 -; AVX2-NEXT: sarq $63, %r14 -; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $52, %r15 -; AVX2-NEXT: sarq $63, %r15 -; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $51, %r12 -; AVX2-NEXT: sarq $63, %r12 -; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 -; AVX2-NEXT: shlq $50, %rbp -; AVX2-NEXT: sarq $63, %rbp -; AVX2-NEXT: vpinsrb $13, %ebp, %xmm1, %xmm1 -; AVX2-NEXT: shlq $49, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX2-NEXT: shrl $15, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_sext_32i1_to_32i8: @@ -4798,309 +2597,21 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; X32-SSE2-LABEL: load_sext_32i1_to_32i8: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: pushl %ebp -; X32-SSE2-NEXT: pushl %ebx -; X32-SSE2-NEXT: pushl %edi -; X32-SSE2-NEXT: pushl %esi -; X32-SSE2-NEXT: subl $28, %esp -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movswl (%eax), %edx -; X32-SSE2-NEXT: movl %edx, %ebp -; X32-SSE2-NEXT: movl %edx, %esi -; X32-SSE2-NEXT: movl %edx, %edi -; X32-SSE2-NEXT: movl %edx, %ebx -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shrl $15, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shll $17, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm4 -; X32-SSE2-NEXT: movl %edx, %ebp -; X32-SSE2-NEXT: shll $18, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm1 -; X32-SSE2-NEXT: movl %edx, %esi -; X32-SSE2-NEXT: shll $19, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm2 -; X32-SSE2-NEXT: movl %edx, %edi -; X32-SSE2-NEXT: shll $20, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm5 -; X32-SSE2-NEXT: movl %edx, %ebx -; X32-SSE2-NEXT: shll $21, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm6 -; X32-SSE2-NEXT: movl %edx, %ecx -; X32-SSE2-NEXT: shll $22, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm7 -; X32-SSE2-NEXT: movl %edx, %ebp -; X32-SSE2-NEXT: shll $23, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm3 -; X32-SSE2-NEXT: movl %edx, %esi -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; X32-SSE2-NEXT: shll $28, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm0 -; X32-SSE2-NEXT: movl %edx, %edi -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X32-SSE2-NEXT: shll $29, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm1 -; X32-SSE2-NEXT: movl %edx, %ebx -; X32-SSE2-NEXT: movsbl %dl, %edx -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: shll $30, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: shll $31, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X32-SSE2-NEXT: shll $26, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm7 -; X32-SSE2-NEXT: shll $27, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm2 -; X32-SSE2-NEXT: shll $25, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm6 -; X32-SSE2-NEXT: shrl $7, %edx -; X32-SSE2-NEXT: movd %edx, %xmm5 -; X32-SSE2-NEXT: movswl 2(%eax), %eax -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: shrl $15, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm4 -; X32-SSE2-NEXT: movdqu %xmm4, (%esp) # 16-byte Spill -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: shll $17, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm4 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; X32-SSE2-NEXT: shll $18, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm7 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X32-SSE2-NEXT: shll $19, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm5 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; X32-SSE2-NEXT: shll $20, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm6 -; X32-SSE2-NEXT: movl %eax, %esi -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: shll $21, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm1 -; X32-SSE2-NEXT: movl %eax, %edi -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; X32-SSE2-NEXT: shll $22, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm3 -; X32-SSE2-NEXT: movl %eax, %ebx -; X32-SSE2-NEXT: movdqu (%esp), %xmm2 # 16-byte Reload -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X32-SSE2-NEXT: shll $23, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm2 -; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; X32-SSE2-NEXT: shll $28, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm7 -; X32-SSE2-NEXT: movl %eax, %ebp -; X32-SSE2-NEXT: movsbl %al, %eax -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; X32-SSE2-NEXT: shll $29, %ecx -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X32-SSE2-NEXT: shll $30, %esi -; X32-SSE2-NEXT: sarl $31, %esi -; X32-SSE2-NEXT: movd %esi, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; X32-SSE2-NEXT: shll $31, %edi -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: movd %edi, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; X32-SSE2-NEXT: shll $26, %ebx -; X32-SSE2-NEXT: sarl $31, %ebx -; X32-SSE2-NEXT: movd %ebx, %xmm5 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE2-NEXT: shll $27, %edx -; X32-SSE2-NEXT: sarl $31, %edx -; X32-SSE2-NEXT: movd %edx, %xmm3 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; X32-SSE2-NEXT: shll $25, %ebp -; X32-SSE2-NEXT: sarl $31, %ebp -; X32-SSE2-NEXT: movd %ebp, %xmm4 -; X32-SSE2-NEXT: shrl $7, %eax -; X32-SSE2-NEXT: movd %eax, %xmm5 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE2-NEXT: addl $28, %esp -; X32-SSE2-NEXT: popl %esi -; X32-SSE2-NEXT: popl %edi -; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: popl %ebp -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_32i1_to_32i8: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pushl %esi -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movswl (%eax), %ecx -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $30, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movl %ecx, %esi -; X32-SSE41-NEXT: shll $31, %esi -; X32-SSE41-NEXT: sarl $31, %esi -; X32-SSE41-NEXT: movd %esi, %xmm0 -; X32-SSE41-NEXT: pinsrb $1, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $29, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $2, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $28, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $3, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $27, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $4, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $26, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $5, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $25, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $6, %edx, %xmm0 -; X32-SSE41-NEXT: movsbl %cl, %edx -; X32-SSE41-NEXT: shrl $7, %edx -; X32-SSE41-NEXT: pinsrb $7, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $23, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $8, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $22, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $9, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $21, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $10, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $20, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $11, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $19, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $12, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $18, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $13, %edx, %xmm0 -; X32-SSE41-NEXT: movl %ecx, %edx -; X32-SSE41-NEXT: shll $17, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: pinsrb $14, %edx, %xmm0 -; X32-SSE41-NEXT: shrl $15, %ecx -; X32-SSE41-NEXT: pinsrb $15, %ecx, %xmm0 -; X32-SSE41-NEXT: movswl 2(%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $30, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: shll $31, %edx -; X32-SSE41-NEXT: sarl $31, %edx -; X32-SSE41-NEXT: movd %edx, %xmm1 -; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $29, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $28, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $27, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $26, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $25, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 -; X32-SSE41-NEXT: movsbl %al, %ecx -; X32-SSE41-NEXT: shrl $7, %ecx -; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $23, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $22, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $21, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $20, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $19, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $18, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $17, %ecx -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; X32-SSE41-NEXT: shrl $15, %eax -; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; X32-SSE41-NEXT: popl %esi -; X32-SSE41-NEXT: retl +; X32-SSE-LABEL: load_sext_32i1_to_32i8: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; X32-SSE-NEXT: retl entry: %X = load <32 x i1>, <32 x i1>* %ptr %Y = sext <32 x i1> %X to <32 x i8>