Index: llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ llvm/trunk/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -173,15 +173,30 @@
     return;
   }
 
+  // If the mask is not v1i1, use scalar bit test operations. This generates
+  // better results on X86 at least.
+  Value *SclrMask;
+  if (VectorWidth != 1) {
+    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+  }
+
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
     //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
-    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %mask_1 = and i16 %scalar_mask, 1 << Idx
+    //  %cond = icmp ne i16 %mask_1, 0
     //  br i1 %mask_1, label %cond.load, label %else
     //
-
-    Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+    Value *Predicate;
+    if (VectorWidth != 1) {
+      Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+      Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+                                       Builder.getIntN(VectorWidth, 0));
+    } else {
+      Predicate = Builder.CreateExtractElement(Mask, Idx);
+    }
 
     // Create "cond" block
     //
@@ -290,13 +305,29 @@
     return;
   }
 
+  // If the mask is not v1i1, use scalar bit test operations. This generates
+  // better results on X86 at least.
+  Value *SclrMask;
+  if (VectorWidth != 1) {
+    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+  }
+
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
-    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %mask_1 = and i16 %scalar_mask, 1 << Idx
+    //  %cond = icmp ne i16 %mask_1, 0
     //  br i1 %mask_1, label %cond.store, label %else
     //
-    Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+    Value *Predicate;
+    if (VectorWidth != 1) {
+      Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+      Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+                                       Builder.getIntN(VectorWidth, 0));
+    } else {
+      Predicate = Builder.CreateExtractElement(Mask, Idx);
+    }
 
     // Create "cond" block
     //
@@ -392,15 +423,30 @@
     return;
   }
 
+  // If the mask is not v1i1, use scalar bit test operations. This generates
+  // better results on X86 at least.
+  Value *SclrMask;
+  if (VectorWidth != 1) {
+    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+  }
+
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
-    //  %Mask1 = extractelement <16 x i1> %Mask, i32 1
+    //  %Mask1 = and i16 %scalar_mask, 1 << Idx
+    //  %cond = icmp ne i16 %Mask1, 0
     //  br i1 %Mask1, label %cond.load, label %else
     //
-    Value *Predicate =
-        Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+    Value *Predicate;
+    if (VectorWidth != 1) {
+      Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+      Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+                                       Builder.getIntN(VectorWidth, 0));
+    } else {
+      Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+    }
 
     // Create "cond" block
     //
@@ -499,14 +545,29 @@
     return;
   }
 
+  // If the mask is not v1i1, use scalar bit test operations. This generates
+  // better results on X86 at least.
+  Value *SclrMask;
+  if (VectorWidth != 1) {
+    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+  }
+
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
-    //  %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+    //  %Mask1 = and i16 %scalar_mask, 1 << Idx
+    //  %cond = icmp ne i16 %Mask1, 0
     //  br i1 %Mask1, label %cond.store, label %else
     //
-    Value *Predicate =
-        Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+    Value *Predicate;
+    if (VectorWidth != 1) {
+      Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+      Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+                                       Builder.getIntN(VectorWidth, 0));
+    } else {
+      Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+    }
 
     // Create "cond" block
    //
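To make the four parallel changes above concrete: instead of extracting each i1 element of the mask and branching on it, the pass now bitcasts the whole mask vector to an integer once, then tests one bit of that integer per lane. A minimal IR sketch, assuming a <4 x i1> %mask and showing lane 1 only (the value names mirror the comments above; the actual pass numbers the intermediate values automatically):

    ; before
    %mask_1 = extractelement <4 x i1> %mask, i32 1
    br i1 %mask_1, label %cond.load1, label %else2

    ; after
    %scalar_mask = bitcast <4 x i1> %mask to i4
    %mask_1 = and i4 %scalar_mask, 2          ; 2 == 1 << 1
    %cond = icmp ne i4 %mask_1, 0
    br i1 %cond, label %cond.load1, label %else2

On X86 the bitcast lowers to a single mask extraction (vmovmskps/vmovmskpd/vpmovmskb), so each lane check in the updated tests below becomes a test of an immediate bit against the one scalar mask (testb $2, %al ... testl $32768, %ecx) rather than a per-lane vpextrb or kshiftrw/kmovd followed by testb $1.

Index: llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
+++ llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll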
@@ -31,22 +31,26 @@
 ; NOGATHER-LABEL: masked_gather_v2i32:
 ; NOGATHER:       # %bb.0: # %entry
 ; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
-; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    testb $1, %al
-; NOGATHER-NEXT:    je .LBB0_2
-; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm2, %rax
-; NOGATHER-NEXT:    movl (%rax), %eax
-; NOGATHER-NEXT:    vpinsrq $0, %rax, %xmm1, %xmm1
-; NOGATHER-NEXT:  .LBB0_2: # %else
-; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT:    vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT:    vmovmskpd %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
+; NOGATHER-NEXT:    jne .LBB0_1
+; NOGATHER-NEXT:  # %bb.2: # %else
+; NOGATHER-NEXT:    testb $2, %al
+; NOGATHER-NEXT:    jne .LBB0_3
+; NOGATHER-NEXT:  .LBB0_4: # %else2
+; NOGATHER-NEXT:    vmovdqa %xmm1, %xmm0
+; NOGATHER-NEXT:    retq
+; NOGATHER-NEXT:  .LBB0_1: # %cond.load
+; NOGATHER-NEXT:    vmovq %xmm2, %rcx
+; NOGATHER-NEXT:    movl (%rcx), %ecx
+; NOGATHER-NEXT:    vpinsrq $0, %rcx, %xmm1, %xmm1
+; NOGATHER-NEXT:    testb $2, %al
 ; NOGATHER-NEXT:    je .LBB0_4
-; NOGATHER-NEXT:  # %bb.3: # %cond.load1
+; NOGATHER-NEXT:  .LBB0_3: # %cond.load1
 ; NOGATHER-NEXT:    vpextrq $1, %xmm2, %rax
 ; NOGATHER-NEXT:    movl (%rax), %eax
 ; NOGATHER-NEXT:    vpinsrq $1, %rax, %xmm1, %xmm1
-; NOGATHER-NEXT:  .LBB0_4: # %else2
 ; NOGATHER-NEXT:    vmovdqa %xmm1, %xmm0
 ; NOGATHER-NEXT:    retq
 entry:
@@ -80,22 +84,26 @@
 ; NOGATHER-LABEL: masked_gather_v2i32_concat:
 ; NOGATHER:       # %bb.0: # %entry
 ; NOGATHER-NEXT:    vmovdqa (%rdi), %xmm2
-; NOGATHER-NEXT:    vpextrb $0, %xmm0, %eax
-; NOGATHER-NEXT:    testb $1, %al
-; NOGATHER-NEXT:    je .LBB1_2
-; NOGATHER-NEXT:  # %bb.1: # %cond.load
-; NOGATHER-NEXT:    vmovq %xmm2, %rax
-; NOGATHER-NEXT:    movl (%rax), %eax
-; NOGATHER-NEXT:    vpinsrq $0, %rax, %xmm1, %xmm1
-; NOGATHER-NEXT:  .LBB1_2: # %else
-; NOGATHER-NEXT:    vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT:    vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT:    vmovmskpd %xmm0, %eax
 ; NOGATHER-NEXT:    testb $1, %al
+; NOGATHER-NEXT:    jne .LBB1_1
+; NOGATHER-NEXT:  # %bb.2: # %else
+; NOGATHER-NEXT:    testb $2, %al
+; NOGATHER-NEXT:    jne .LBB1_3
+; NOGATHER-NEXT:  .LBB1_4: # %else2
+; NOGATHER-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; NOGATHER-NEXT:    retq
+; NOGATHER-NEXT:  .LBB1_1: # %cond.load
+; NOGATHER-NEXT:    vmovq %xmm2, %rcx
+; NOGATHER-NEXT:    movl (%rcx), %ecx
+; NOGATHER-NEXT:    vpinsrq $0, %rcx, %xmm1, %xmm1
+; NOGATHER-NEXT:    testb $2, %al
 ; NOGATHER-NEXT:    je .LBB1_4
-; NOGATHER-NEXT:  # %bb.3: # 
%cond.load1 +; NOGATHER-NEXT: .LBB1_3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax ; NOGATHER-NEXT: movl (%rax), %eax ; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1 -; NOGATHER-NEXT: .LBB1_4: # %else2 ; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; NOGATHER-NEXT: retq entry: @@ -130,21 +138,25 @@ ; NOGATHER-LABEL: masked_gather_v2float: ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2 -; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB2_2 -; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; NOGATHER-NEXT: .LBB2_2: # %else -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax +; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 +; NOGATHER-NEXT: vmovmskpd %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: jne .LBB2_1 +; NOGATHER-NEXT: # %bb.2: # %else +; NOGATHER-NEXT: testb $2, %al +; NOGATHER-NEXT: jne .LBB2_3 +; NOGATHER-NEXT: .LBB2_4: # %else2 +; NOGATHER-NEXT: vmovaps %xmm1, %xmm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB2_1: # %cond.load +; NOGATHER-NEXT: vmovq %xmm2, %rcx +; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB2_4 -; NOGATHER-NEXT: # %bb.3: # %cond.load1 +; NOGATHER-NEXT: .LBB2_3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; NOGATHER-NEXT: .LBB2_4: # %else2 ; NOGATHER-NEXT: vmovaps %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: @@ -176,21 +188,25 @@ ; NOGATHER-LABEL: masked_gather_v2float_concat: ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2 -; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB3_2 -; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; NOGATHER-NEXT: .LBB3_2: # %else -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax +; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 +; NOGATHER-NEXT: vmovmskpd %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: jne .LBB3_1 +; NOGATHER-NEXT: # %bb.2: # %else +; NOGATHER-NEXT: testb $2, %al +; NOGATHER-NEXT: jne .LBB3_3 +; NOGATHER-NEXT: .LBB3_4: # %else2 +; NOGATHER-NEXT: vmovaps %xmm1, %xmm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB3_1: # %cond.load +; NOGATHER-NEXT: vmovq %xmm2, %rcx +; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NOGATHER-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB3_4 -; NOGATHER-NEXT: # %bb.3: # %cond.load1 +; NOGATHER-NEXT: .LBB3_3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; NOGATHER-NEXT: .LBB3_4: # %else2 ; NOGATHER-NEXT: vmovaps %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: @@ -221,35 +237,38 @@ ; ; NOGATHER-LABEL: masked_gather_v4i32: ; NOGATHER: # %bb.0: # %entry -; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax +; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm1 +; NOGATHER-NEXT: vmovmskps %xmm1, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB4_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm0, %rax -; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2 +; NOGATHER-NEXT: vmovq 
%xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 ; NOGATHER-NEXT: .LBB4_2: # %else -; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB4_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2 ; NOGATHER-NEXT: .LBB4_4: # %else2 -; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax ; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB4_6 -; NOGATHER-NEXT: # %bb.5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm0, %rax -; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2 -; NOGATHER-NEXT: .LBB4_6: # %else5 -; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $4, %al +; NOGATHER-NEXT: jne .LBB4_5 +; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: jne .LBB4_7 +; NOGATHER-NEXT: .LBB4_8: # %else8 +; NOGATHER-NEXT: vmovdqa %xmm2, %xmm0 +; NOGATHER-NEXT: vzeroupper +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB4_5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm2, %xmm2 +; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB4_8 -; NOGATHER-NEXT: # %bb.7: # %cond.load7 +; NOGATHER-NEXT: .LBB4_7: # %cond.load7 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax ; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm2 -; NOGATHER-NEXT: .LBB4_8: # %else8 ; NOGATHER-NEXT: vmovdqa %xmm2, %xmm0 ; NOGATHER-NEXT: vzeroupper ; NOGATHER-NEXT: retq @@ -278,36 +297,39 @@ ; ; NOGATHER-LABEL: masked_gather_v4float: ; NOGATHER: # %bb.0: # %entry -; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax +; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm1 +; NOGATHER-NEXT: vmovmskps %xmm1, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB5_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm0, %rax -; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; NOGATHER-NEXT: .LBB5_2: # %else -; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB5_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; NOGATHER-NEXT: .LBB5_4: # %else2 -; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax ; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB5_6 -; NOGATHER-NEXT: # %bb.5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm0, %rax +; NOGATHER-NEXT: testb $4, %al +; NOGATHER-NEXT: jne .LBB5_5 +; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: jne .LBB5_7 +; NOGATHER-NEXT: .LBB5_8: # %else8 +; NOGATHER-NEXT: vmovaps %xmm2, %xmm0 +; NOGATHER-NEXT: vzeroupper +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB5_5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; NOGATHER-NEXT: .LBB5_6: # %else5 -; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB5_8 -; NOGATHER-NEXT: # %bb.7: # 
%cond.load7 +; NOGATHER-NEXT: .LBB5_7: # %cond.load7 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; NOGATHER-NEXT: .LBB5_8: # %else8 ; NOGATHER-NEXT: vmovaps %xmm2, %xmm0 ; NOGATHER-NEXT: vzeroupper ; NOGATHER-NEXT: retq @@ -347,76 +369,82 @@ ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3 ; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2 -; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax +; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0 +; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; NOGATHER-NEXT: vpmovmskb %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB6_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm3, %rax -; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm1, %xmm4 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vmovq %xmm3, %rcx +; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_2: # %else -; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB6_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm4 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx +; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_4: # %else2 -; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax -; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB6_6 -; NOGATHER-NEXT: # %bb.5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm3, %rax -; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm1, %xmm4 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: .LBB6_6: # %else5 -; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB6_8 -; NOGATHER-NEXT: # %bb.7: # %cond.load7 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm3 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0 +; NOGATHER-NEXT: testb $4, %al +; NOGATHER-NEXT: jne .LBB6_5 +; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: jne .LBB6_7 ; NOGATHER-NEXT: .LBB6_8: # %else8 -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB6_10 -; NOGATHER-NEXT: # %bb.9: # %cond.load10 -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3 -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $16, %al +; NOGATHER-NEXT: jne .LBB6_9 ; NOGATHER-NEXT: .LBB6_10: # %else11 -; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB6_12 -; NOGATHER-NEXT: # %bb.11: # %cond.load13 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3 -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NOGATHER-NEXT: .LBB6_11: # %cond.load13 +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 +; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 +; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 
; NOGATHER-NEXT: .LBB6_12: # %else14 -; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB6_14 -; NOGATHER-NEXT: # %bb.13: # %cond.load16 -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3 -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB6_14: # %else17 -; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: testb $64, %al +; NOGATHER-NEXT: jne .LBB6_13 +; NOGATHER-NEXT: # %bb.14: # %else17 +; NOGATHER-NEXT: testb $-128, %al +; NOGATHER-NEXT: jne .LBB6_15 +; NOGATHER-NEXT: .LBB6_16: # %else20 +; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB6_5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm3 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: je .LBB6_8 +; NOGATHER-NEXT: .LBB6_7: # %cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: testb $16, %al +; NOGATHER-NEXT: je .LBB6_10 +; NOGATHER-NEXT: .LBB6_9: # %cond.load10 +; NOGATHER-NEXT: vmovq %xmm2, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 +; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm0, %xmm0 +; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $32, %al +; NOGATHER-NEXT: jne .LBB6_11 +; NOGATHER-NEXT: jmp .LBB6_12 +; NOGATHER-NEXT: .LBB6_13: # %cond.load16 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm2, %xmm2 +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $-128, %al ; NOGATHER-NEXT: je .LBB6_16 -; NOGATHER-NEXT: # %bb.15: # %cond.load19 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax +; NOGATHER-NEXT: .LBB6_15: # %cond.load19 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 ; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0 ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB6_16: # %else20 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -456,77 +484,83 @@ ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3 ; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2 -; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax +; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0 +; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; NOGATHER-NEXT: vpmovmskb %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB7_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm3, %rax -; NOGATHER-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7] +; NOGATHER-NEXT: vmovq %xmm3, %rcx +; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; NOGATHER-NEXT: .LBB7_2: # %else -; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB7_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0],mem[0],xmm1[2,3] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = 
ymm4[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],mem[0],xmm1[2,3] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB7_4: # %else2 -; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax -; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB7_6 -; NOGATHER-NEXT: # %bb.5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm3, %rax -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: .LBB7_6: # %else5 -; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB7_8 -; NOGATHER-NEXT: # %bb.7: # %cond.load7 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],mem[0] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0 +; NOGATHER-NEXT: testb $4, %al +; NOGATHER-NEXT: jne .LBB7_5 +; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: jne .LBB7_7 ; NOGATHER-NEXT: .LBB7_8: # %else8 -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB7_10 -; NOGATHER-NEXT: # %bb.9: # %cond.load10 -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $16, %al +; NOGATHER-NEXT: jne .LBB7_9 ; NOGATHER-NEXT: .LBB7_10: # %else11 -; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB7_12 -; NOGATHER-NEXT: # %bb.11: # %cond.load13 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NOGATHER-NEXT: .LBB7_11: # %cond.load13 +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB7_12: # %else14 -; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB7_14 -; NOGATHER-NEXT: # %bb.13: # %cond.load16 -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB7_14: # %else17 -; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: testb $64, %al +; NOGATHER-NEXT: jne .LBB7_13 +; NOGATHER-NEXT: # %bb.14: # %else17 +; NOGATHER-NEXT: testb $-128, %al +; NOGATHER-NEXT: jne .LBB7_15 +; NOGATHER-NEXT: .LBB7_16: # %else20 +; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB7_5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],mem[0],xmm1[3] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: je .LBB7_8 +; NOGATHER-NEXT: .LBB7_7: # 
%cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: testb $16, %al +; NOGATHER-NEXT: je .LBB7_10 +; NOGATHER-NEXT: .LBB7_9: # %cond.load10 +; NOGATHER-NEXT: vmovq %xmm2, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 +; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NOGATHER-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $32, %al +; NOGATHER-NEXT: jne .LBB7_11 +; NOGATHER-NEXT: jmp .LBB7_12 +; NOGATHER-NEXT: .LBB7_13: # %cond.load16 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $-128, %al ; NOGATHER-NEXT: je .LBB7_16 -; NOGATHER-NEXT: # %bb.15: # %cond.load19 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax +; NOGATHER-NEXT: .LBB7_15: # %cond.load19 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB7_16: # %else20 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -560,41 +594,43 @@ ; NOGATHER-LABEL: masked_gather_v4i64: ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2 -; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax +; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0 +; NOGATHER-NEXT: vmovmskps %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB8_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm3 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vmovq %xmm2, %rcx +; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm1, %xmm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB8_2: # %else -; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB8_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm3 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx +; NOGATHER-NEXT: vpinsrq $1, (%rcx), %xmm1, %xmm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB8_4: # %else2 -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB8_6 -; NOGATHER-NEXT: # %bb.5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm3, %xmm3 -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB8_6: # %else5 -; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: testb $4, %al +; NOGATHER-NEXT: jne .LBB8_5 +; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: jne .LBB8_7 +; NOGATHER-NEXT: .LBB8_8: # %else8 +; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB8_5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: 
vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2 +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB8_8 -; NOGATHER-NEXT: # %bb.7: # %cond.load7 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax +; NOGATHER-NEXT: .LBB8_7: # %cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 ; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB8_8: # %else8 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -628,41 +664,43 @@ ; NOGATHER-LABEL: masked_gather_v4double: ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2 -; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax +; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0 +; NOGATHER-NEXT: vmovmskps %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB9_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; NOGATHER-NEXT: vmovq %xmm2, %rcx +; NOGATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; NOGATHER-NEXT: .LBB9_2: # %else -; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB9_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: vmovhps {{.*#+}} xmm3 = xmm1[0,1],mem[0,1] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx +; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm1[0,1],mem[0,1] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB9_4: # %else2 -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2 -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB9_6 -; NOGATHER-NEXT: # %bb.5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB9_6: # %else5 -; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: testb $4, %al +; NOGATHER-NEXT: jne .LBB9_5 +; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: testb $8, %al +; NOGATHER-NEXT: jne .LBB9_7 +; NOGATHER-NEXT: .LBB9_8: # %else8 +; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB9_5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB9_8 -; NOGATHER-NEXT: # %bb.7: # %cond.load7 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax +; NOGATHER-NEXT: .LBB9_7: # %cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 ; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: .LBB9_8: # %else8 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -694,20 +732,24 @@ ; NOGATHER-LABEL: masked_gather_v2i64: ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2 -; 
NOGATHER-NEXT: vpextrb $0, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB10_2 -; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1 -; NOGATHER-NEXT: .LBB10_2: # %else -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax +; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 +; NOGATHER-NEXT: vmovmskpd %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: jne .LBB10_1 +; NOGATHER-NEXT: # %bb.2: # %else +; NOGATHER-NEXT: testb $2, %al +; NOGATHER-NEXT: jne .LBB10_3 +; NOGATHER-NEXT: .LBB10_4: # %else2 +; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB10_1: # %cond.load +; NOGATHER-NEXT: vmovq %xmm2, %rcx +; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm1, %xmm1 +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB10_4 -; NOGATHER-NEXT: # %bb.3: # %cond.load1 +; NOGATHER-NEXT: .LBB10_3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax ; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm1 -; NOGATHER-NEXT: .LBB10_4: # %else2 ; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: @@ -739,20 +781,24 @@ ; NOGATHER-LABEL: masked_gather_v2double: ; NOGATHER: # %bb.0: # %entry ; NOGATHER-NEXT: vmovdqa (%rdi), %xmm2 -; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax +; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0 +; NOGATHER-NEXT: vmovmskpd %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al -; NOGATHER-NEXT: je .LBB11_2 -; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm2, %rax +; NOGATHER-NEXT: jne .LBB11_1 +; NOGATHER-NEXT: # %bb.2: # %else +; NOGATHER-NEXT: testb $2, %al +; NOGATHER-NEXT: jne .LBB11_3 +; NOGATHER-NEXT: .LBB11_4: # %else2 +; NOGATHER-NEXT: vmovaps %xmm1, %xmm0 +; NOGATHER-NEXT: retq +; NOGATHER-NEXT: .LBB11_1: # %cond.load +; NOGATHER-NEXT: vmovq %xmm2, %rcx ; NOGATHER-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; NOGATHER-NEXT: .LBB11_2: # %else -; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax -; NOGATHER-NEXT: testb $1, %al +; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB11_4 -; NOGATHER-NEXT: # %bb.3: # %cond.load1 +; NOGATHER-NEXT: .LBB11_3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax ; NOGATHER-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; NOGATHER-NEXT: .LBB11_4: # %else2 ; NOGATHER-NEXT: vmovaps %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: Index: llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll +++ llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll @@ -158,13 +158,12 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpmovb2m %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vpmovmskb %xmm0, %ecx ; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: je LBB12_1 ; CHECK-NEXT: ## %bb.2: ## %cond.load -; CHECK-NEXT: movswl (%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 +; CHECK-NEXT: movswl (%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm8 ; CHECK-NEXT: jmp LBB12_3 ; CHECK-NEXT: LBB12_1: @@ -172,13 +171,11 @@ ; CHECK-NEXT: LBB12_3: ## %else ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vxorps %xmm9, %xmm9, %xmm9 -; CHECK-NEXT: kshiftrw $1, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: testb $2, %cl ; CHECK-NEXT: je LBB12_4 ; CHECK-NEXT: ## %bb.5: ## %cond.load1 -; CHECK-NEXT: movswl 2(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 +; CHECK-NEXT: movswl 2(%rsi), %edx +; 
CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vmovaps %xmm2, %xmm1 ; CHECK-NEXT: vmovaps %xmm2, %xmm7 ; CHECK-NEXT: vmovaps %xmm2, %xmm6 @@ -193,7 +190,9 @@ ; CHECK-NEXT: vmovaps %xmm2, %xmm11 ; CHECK-NEXT: vmovaps %xmm2, %xmm10 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm2 -; CHECK-NEXT: jmp LBB12_6 +; CHECK-NEXT: testb $4, %cl +; CHECK-NEXT: jne LBB12_7 +; CHECK-NEXT: jmp LBB12_8 ; CHECK-NEXT: LBB12_4: ; CHECK-NEXT: vmovaps %xmm2, %xmm1 ; CHECK-NEXT: vmovaps %xmm2, %xmm7 @@ -208,129 +207,52 @@ ; CHECK-NEXT: vmovaps %xmm2, %xmm12 ; CHECK-NEXT: vmovaps %xmm2, %xmm11 ; CHECK-NEXT: vmovaps %xmm2, %xmm10 -; CHECK-NEXT: LBB12_6: ## %else2 -; CHECK-NEXT: kshiftrw $2, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: testb $4, %cl ; CHECK-NEXT: je LBB12_8 -; CHECK-NEXT: ## %bb.7: ## %cond.load4 -; CHECK-NEXT: movswl 4(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 +; CHECK-NEXT: LBB12_7: ## %cond.load4 +; CHECK-NEXT: movswl 4(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1 ; CHECK-NEXT: LBB12_8: ## %else5 -; CHECK-NEXT: kshiftrw $3, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_10 -; CHECK-NEXT: ## %bb.9: ## %cond.load7 -; CHECK-NEXT: movswl 6(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm7 -; CHECK-NEXT: LBB12_10: ## %else8 -; CHECK-NEXT: kshiftrw $4, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_12 -; CHECK-NEXT: ## %bb.11: ## %cond.load10 -; CHECK-NEXT: movswl 8(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm6 +; CHECK-NEXT: testb $8, %cl +; CHECK-NEXT: jne LBB12_9 +; CHECK-NEXT: ## %bb.10: ## %else8 +; CHECK-NEXT: testb $16, %cl +; CHECK-NEXT: jne LBB12_11 ; CHECK-NEXT: LBB12_12: ## %else11 -; CHECK-NEXT: kshiftrw $5, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_14 -; CHECK-NEXT: ## %bb.13: ## %cond.load13 -; CHECK-NEXT: movswl 10(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm5 +; CHECK-NEXT: testb $32, %cl +; CHECK-NEXT: jne LBB12_13 ; CHECK-NEXT: LBB12_14: ## %else14 -; CHECK-NEXT: kshiftrw $6, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_16 -; CHECK-NEXT: ## %bb.15: ## %cond.load16 -; CHECK-NEXT: movswl 12(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm4 +; CHECK-NEXT: testb $64, %cl +; CHECK-NEXT: jne LBB12_15 ; CHECK-NEXT: LBB12_16: ## %else17 -; CHECK-NEXT: kshiftrw $7, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_18 -; CHECK-NEXT: ## %bb.17: ## %cond.load19 -; CHECK-NEXT: movswl 14(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm3 +; CHECK-NEXT: testb $-128, %cl +; CHECK-NEXT: jne LBB12_17 ; CHECK-NEXT: LBB12_18: ## %else20 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_20 -; CHECK-NEXT: ## %bb.19: ## %cond.load22 -; CHECK-NEXT: movswl 16(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm16 +; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 +; CHECK-NEXT: jne LBB12_19 ; CHECK-NEXT: LBB12_20: ## %else23 -; CHECK-NEXT: kshiftrw $9, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_22 -; CHECK-NEXT: ## %bb.21: ## %cond.load25 -; CHECK-NEXT: movswl 18(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, 
%xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm15 +; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 +; CHECK-NEXT: jne LBB12_21 ; CHECK-NEXT: LBB12_22: ## %else26 -; CHECK-NEXT: kshiftrw $10, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_24 -; CHECK-NEXT: ## %bb.23: ## %cond.load28 -; CHECK-NEXT: movswl 20(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm14 +; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 +; CHECK-NEXT: jne LBB12_23 ; CHECK-NEXT: LBB12_24: ## %else29 -; CHECK-NEXT: kshiftrw $11, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_26 -; CHECK-NEXT: ## %bb.25: ## %cond.load31 -; CHECK-NEXT: movswl 22(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm13 +; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 +; CHECK-NEXT: jne LBB12_25 ; CHECK-NEXT: LBB12_26: ## %else32 -; CHECK-NEXT: kshiftrw $12, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_28 -; CHECK-NEXT: ## %bb.27: ## %cond.load34 -; CHECK-NEXT: movswl 24(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm12 +; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 +; CHECK-NEXT: jne LBB12_27 ; CHECK-NEXT: LBB12_28: ## %else35 -; CHECK-NEXT: kshiftrw $13, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_30 -; CHECK-NEXT: ## %bb.29: ## %cond.load37 -; CHECK-NEXT: movswl 26(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm11 +; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 +; CHECK-NEXT: jne LBB12_29 ; CHECK-NEXT: LBB12_30: ## %else38 -; CHECK-NEXT: kshiftrw $14, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_32 -; CHECK-NEXT: ## %bb.31: ## %cond.load40 -; CHECK-NEXT: movswl 28(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm10 +; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 +; CHECK-NEXT: jne LBB12_31 ; CHECK-NEXT: LBB12_32: ## %else41 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kmovd %k0, %ecx -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 ; CHECK-NEXT: je LBB12_34 -; CHECK-NEXT: ## %bb.33: ## %cond.load43 +; CHECK-NEXT: LBB12_33: ## %cond.load43 ; CHECK-NEXT: movswl 30(%rsi), %ecx ; CHECK-NEXT: vmovd %ecx, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm9 @@ -384,6 +306,79 @@ ; CHECK-NEXT: vmovd %xmm0, %ecx ; CHECK-NEXT: movw %cx, 30(%rax) ; CHECK-NEXT: retq +; CHECK-NEXT: LBB12_9: ## %cond.load7 +; CHECK-NEXT: movswl 6(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm7 +; CHECK-NEXT: testb $16, %cl +; CHECK-NEXT: je LBB12_12 +; CHECK-NEXT: LBB12_11: ## %cond.load10 +; CHECK-NEXT: movswl 8(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm6 +; CHECK-NEXT: testb $32, %cl +; CHECK-NEXT: je LBB12_14 +; CHECK-NEXT: LBB12_13: ## %cond.load13 +; CHECK-NEXT: movswl 10(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm5 +; CHECK-NEXT: testb $64, %cl +; CHECK-NEXT: je LBB12_16 +; CHECK-NEXT: LBB12_15: ## %cond.load16 +; CHECK-NEXT: movswl 12(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm4 +; CHECK-NEXT: testb $-128, %cl +; CHECK-NEXT: je LBB12_18 +; CHECK-NEXT: LBB12_17: ## %cond.load19 +; CHECK-NEXT: movswl 14(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm3 +; CHECK-NEXT: testl 
$256, %ecx ## imm = 0x100 +; CHECK-NEXT: je LBB12_20 +; CHECK-NEXT: LBB12_19: ## %cond.load22 +; CHECK-NEXT: movswl 16(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm16 +; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 +; CHECK-NEXT: je LBB12_22 +; CHECK-NEXT: LBB12_21: ## %cond.load25 +; CHECK-NEXT: movswl 18(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm15 +; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 +; CHECK-NEXT: je LBB12_24 +; CHECK-NEXT: LBB12_23: ## %cond.load28 +; CHECK-NEXT: movswl 20(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm14 +; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 +; CHECK-NEXT: je LBB12_26 +; CHECK-NEXT: LBB12_25: ## %cond.load31 +; CHECK-NEXT: movswl 22(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm13 +; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 +; CHECK-NEXT: je LBB12_28 +; CHECK-NEXT: LBB12_27: ## %cond.load34 +; CHECK-NEXT: movswl 24(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm12 +; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 +; CHECK-NEXT: je LBB12_30 +; CHECK-NEXT: LBB12_29: ## %cond.load37 +; CHECK-NEXT: movswl 26(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm11 +; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 +; CHECK-NEXT: je LBB12_32 +; CHECK-NEXT: LBB12_31: ## %cond.load40 +; CHECK-NEXT: movswl 28(%rsi), %edx +; CHECK-NEXT: vmovd %edx, %xmm0 +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm10 +; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 +; CHECK-NEXT: jne LBB12_33 +; CHECK-NEXT: jmp LBB12_34 %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer) ret <16 x half> %res } @@ -394,159 +389,159 @@ ; CHECK-LABEL: test_mask_store_16xf16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpmovb2m %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vpmovmskb %xmm0, %eax ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je LBB13_2 -; CHECK-NEXT: ## %bb.1: ## %cond.store +; CHECK-NEXT: jne LBB13_1 +; CHECK-NEXT: ## %bb.2: ## %else +; CHECK-NEXT: testb $2, %al +; CHECK-NEXT: jne LBB13_3 +; CHECK-NEXT: LBB13_4: ## %else2 +; CHECK-NEXT: testb $4, %al +; CHECK-NEXT: jne LBB13_5 +; CHECK-NEXT: LBB13_6: ## %else4 +; CHECK-NEXT: testb $8, %al +; CHECK-NEXT: jne LBB13_7 +; CHECK-NEXT: LBB13_8: ## %else6 +; CHECK-NEXT: testb $16, %al +; CHECK-NEXT: jne LBB13_9 +; CHECK-NEXT: LBB13_10: ## %else8 +; CHECK-NEXT: testb $32, %al +; CHECK-NEXT: jne LBB13_11 +; CHECK-NEXT: LBB13_12: ## %else10 +; CHECK-NEXT: testb $64, %al +; CHECK-NEXT: jne LBB13_13 +; CHECK-NEXT: LBB13_14: ## %else12 +; CHECK-NEXT: testb $-128, %al +; CHECK-NEXT: jne LBB13_15 +; CHECK-NEXT: LBB13_16: ## %else14 +; CHECK-NEXT: testl $256, %eax ## imm = 0x100 +; CHECK-NEXT: jne LBB13_17 +; CHECK-NEXT: LBB13_18: ## %else16 +; CHECK-NEXT: testl $512, %eax ## imm = 0x200 +; CHECK-NEXT: jne LBB13_19 +; CHECK-NEXT: LBB13_20: ## %else18 +; CHECK-NEXT: testl $1024, %eax ## imm = 0x400 +; CHECK-NEXT: jne LBB13_21 +; CHECK-NEXT: LBB13_22: ## %else20 +; CHECK-NEXT: testl $2048, %eax ## imm = 0x800 +; CHECK-NEXT: jne LBB13_23 +; CHECK-NEXT: LBB13_24: ## %else22 +; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000 +; CHECK-NEXT: jne LBB13_25 +; CHECK-NEXT: LBB13_26: ## %else24 +; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000 +; CHECK-NEXT: jne LBB13_27 +; CHECK-NEXT: LBB13_28: ## %else26 +; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000 +; 
CHECK-NEXT: jne LBB13_29 +; CHECK-NEXT: LBB13_30: ## %else28 +; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000 +; CHECK-NEXT: jne LBB13_31 +; CHECK-NEXT: LBB13_32: ## %else30 +; CHECK-NEXT: retq +; CHECK-NEXT: LBB13_1: ## %cond.store ; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, (%rdi) -; CHECK-NEXT: LBB13_2: ## %else -; CHECK-NEXT: kshiftrw $1, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: testb $2, %al ; CHECK-NEXT: je LBB13_4 -; CHECK-NEXT: ## %bb.3: ## %cond.store1 +; CHECK-NEXT: LBB13_3: ## %cond.store1 ; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 2(%rdi) -; CHECK-NEXT: LBB13_4: ## %else2 -; CHECK-NEXT: kshiftrw $2, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 2(%rdi) +; CHECK-NEXT: testb $4, %al ; CHECK-NEXT: je LBB13_6 -; CHECK-NEXT: ## %bb.5: ## %cond.store3 +; CHECK-NEXT: LBB13_5: ## %cond.store3 ; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 4(%rdi) -; CHECK-NEXT: LBB13_6: ## %else4 -; CHECK-NEXT: kshiftrw $3, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 4(%rdi) +; CHECK-NEXT: testb $8, %al ; CHECK-NEXT: je LBB13_8 -; CHECK-NEXT: ## %bb.7: ## %cond.store5 +; CHECK-NEXT: LBB13_7: ## %cond.store5 ; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 6(%rdi) -; CHECK-NEXT: LBB13_8: ## %else6 -; CHECK-NEXT: kshiftrw $4, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 6(%rdi) +; CHECK-NEXT: testb $16, %al ; CHECK-NEXT: je LBB13_10 -; CHECK-NEXT: ## %bb.9: ## %cond.store7 +; CHECK-NEXT: LBB13_9: ## %cond.store7 ; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 8(%rdi) -; CHECK-NEXT: LBB13_10: ## %else8 -; CHECK-NEXT: kshiftrw $5, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 8(%rdi) +; CHECK-NEXT: testb $32, %al ; CHECK-NEXT: je LBB13_12 -; CHECK-NEXT: ## %bb.11: ## %cond.store9 +; CHECK-NEXT: LBB13_11: ## %cond.store9 ; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 10(%rdi) -; CHECK-NEXT: LBB13_12: ## %else10 -; CHECK-NEXT: kshiftrw $6, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 10(%rdi) +; CHECK-NEXT: testb $64, %al ; CHECK-NEXT: je LBB13_14 -; CHECK-NEXT: ## %bb.13: ## %cond.store11 +; CHECK-NEXT: LBB13_13: ## %cond.store11 ; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 12(%rdi) -; CHECK-NEXT: LBB13_14: ## %else12 -; CHECK-NEXT: kshiftrw $7, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 12(%rdi) +; CHECK-NEXT: testb $-128, %al ; CHECK-NEXT: je LBB13_16 -; CHECK-NEXT: ## %bb.15: ## %cond.store13 +; CHECK-NEXT: LBB13_15: ## %cond.store13 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 14(%rdi) -; CHECK-NEXT: LBB13_16: ## %else14 -; CHECK-NEXT: 
kshiftrw $8, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 14(%rdi) +; CHECK-NEXT: testl $256, %eax ## imm = 0x100 ; CHECK-NEXT: je LBB13_18 -; CHECK-NEXT: ## %bb.17: ## %cond.store15 +; CHECK-NEXT: LBB13_17: ## %cond.store15 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 16(%rdi) -; CHECK-NEXT: LBB13_18: ## %else16 -; CHECK-NEXT: kshiftrw $9, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 16(%rdi) +; CHECK-NEXT: testl $512, %eax ## imm = 0x200 ; CHECK-NEXT: je LBB13_20 -; CHECK-NEXT: ## %bb.19: ## %cond.store17 +; CHECK-NEXT: LBB13_19: ## %cond.store17 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 18(%rdi) -; CHECK-NEXT: LBB13_20: ## %else18 -; CHECK-NEXT: kshiftrw $10, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 18(%rdi) +; CHECK-NEXT: testl $1024, %eax ## imm = 0x400 ; CHECK-NEXT: je LBB13_22 -; CHECK-NEXT: ## %bb.21: ## %cond.store19 +; CHECK-NEXT: LBB13_21: ## %cond.store19 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 20(%rdi) -; CHECK-NEXT: LBB13_22: ## %else20 -; CHECK-NEXT: kshiftrw $11, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 20(%rdi) +; CHECK-NEXT: testl $2048, %eax ## imm = 0x800 ; CHECK-NEXT: je LBB13_24 -; CHECK-NEXT: ## %bb.23: ## %cond.store21 +; CHECK-NEXT: LBB13_23: ## %cond.store21 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 22(%rdi) -; CHECK-NEXT: LBB13_24: ## %else22 -; CHECK-NEXT: kshiftrw $12, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 22(%rdi) +; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000 ; CHECK-NEXT: je LBB13_26 -; CHECK-NEXT: ## %bb.25: ## %cond.store23 +; CHECK-NEXT: LBB13_25: ## %cond.store23 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 24(%rdi) -; CHECK-NEXT: LBB13_26: ## %else24 -; CHECK-NEXT: kshiftrw $13, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 24(%rdi) +; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000 ; CHECK-NEXT: je LBB13_28 -; CHECK-NEXT: ## %bb.27: ## %cond.store25 +; CHECK-NEXT: LBB13_27: ## %cond.store25 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 26(%rdi) -; CHECK-NEXT: LBB13_28: ## %else26 -; CHECK-NEXT: kshiftrw $14, %k0, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 26(%rdi) +; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000 ; CHECK-NEXT: je LBB13_30 -; CHECK-NEXT: ## %bb.29: ## %cond.store27 +; CHECK-NEXT: LBB13_29: ## %cond.store27 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph 
$4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movw %ax, 28(%rdi) -; CHECK-NEXT: LBB13_30: ## %else28 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movw %cx, 28(%rdi) +; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000 ; CHECK-NEXT: je LBB13_32 -; CHECK-NEXT: ## %bb.31: ## %cond.store29 +; CHECK-NEXT: LBB13_31: ## %cond.store29 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: movw %ax, 30(%rdi) -; CHECK-NEXT: LBB13_32: ## %else30 ; CHECK-NEXT: retq call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) ret void Index: llvm/trunk/test/CodeGen/X86/masked_gather.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/masked_gather.ll +++ llvm/trunk/test/CodeGen/X86/masked_gather.ll @@ -13,76 +13,79 @@ ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm4, %xmm4 ; SSE-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE-NEXT: pextrb $0, %xmm4, %eax -; SSE-NEXT: testb $1, %al -; SSE-NEXT: je .LBB0_2 -; SSE-NEXT: # %bb.1: # %cond.load -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE-NEXT: blendps {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] -; SSE-NEXT: .LBB0_2: # %else -; SSE-NEXT: pextrb $4, %xmm4, %eax +; SSE-NEXT: movmskps %xmm4, %eax ; SSE-NEXT: testb $1, %al +; SSE-NEXT: jne .LBB0_1 +; SSE-NEXT: # %bb.2: # %else +; SSE-NEXT: testb $2, %al +; SSE-NEXT: jne .LBB0_3 +; SSE-NEXT: .LBB0_4: # %else2 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: jne .LBB0_5 +; SSE-NEXT: .LBB0_6: # %else5 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: jne .LBB0_7 +; SSE-NEXT: .LBB0_8: # %else8 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: .LBB0_1: # %cond.load +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7] +; SSE-NEXT: testb $2, %al ; SSE-NEXT: je .LBB0_4 -; SSE-NEXT: # %bb.3: # %cond.load1 -; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: .LBB0_3: # %cond.load1 +; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] -; SSE-NEXT: .LBB0_4: # %else2 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE-NEXT: pextrb $8, %xmm2, %eax -; SSE-NEXT: testb $1, %al +; SSE-NEXT: testb $4, %al ; SSE-NEXT: je .LBB0_6 -; SSE-NEXT: # %bb.5: # %cond.load4 -; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: .LBB0_5: # %cond.load4 +; SSE-NEXT: movq %xmm1, %rcx ; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] -; SSE-NEXT: .LBB0_6: # %else5 -; SSE-NEXT: pextrb $12, %xmm2, %eax -; SSE-NEXT: testb $1, %al +; SSE-NEXT: testb $8, %al ; SSE-NEXT: je .LBB0_8 -; SSE-NEXT: # %bb.7: # %cond.load7 +; SSE-NEXT: .LBB0_7: # %cond.load7 ; SSE-NEXT: pextrq $1, %xmm1, %rax ; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] -; SSE-NEXT: .LBB0_8: # %else8 ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: gather_v4f32_ptr_v4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB0_2 ; AVX1-NEXT: # %bb.1: # %cond.load -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = 
xmm4[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB0_2: # %else
-; AVX1-NEXT: vpextrb $4, %xmm3, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB0_4
; AVX1-NEXT: # %bb.3: # %cond.load1
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: .LBB0_4: # %else2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: je .LBB0_6
-; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: jne .LBB0_5
+; AVX1-NEXT: # %bb.6: # %else5
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB0_7
+; AVX1-NEXT: .LBB0_8: # %else8
+; AVX1-NEXT: vmovaps %xmm2, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB0_5: # %cond.load4
+; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1-NEXT: .LBB0_6: # %else5
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB0_8
-; AVX1-NEXT: # %bb.7: # %cond.load7
+; AVX1-NEXT: .LBB0_7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX1-NEXT: .LBB0_8: # %else8
; AVX1-NEXT: vmovaps %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -90,39 +93,39 @@
; AVX2-LABEL: gather_v4f32_ptr_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB0_2
; AVX2-NEXT: # %bb.1: # %cond.load
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB0_2: # %else
-; AVX2-NEXT: vpextrb $4, %xmm3, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB0_4
; AVX2-NEXT: # %bb.3: # %cond.load1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX2-NEXT: .LBB0_4: # %else2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: je .LBB0_6
-; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: jne .LBB0_5
+; AVX2-NEXT: # %bb.6: # %else5
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB0_7
+; AVX2-NEXT: .LBB0_8: # %else8
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB0_5: # %cond.load4
+; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX2-NEXT: .LBB0_6: # %else5
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB0_8
-; AVX2-NEXT: # %bb.7: # %cond.load7
+; AVX2-NEXT: .LBB0_7: # %cond.load7
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX2-NEXT: .LBB0_8: # %else8
; AVX2-NEXT: vmovaps %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -156,39 +159,38 @@
; SSE-NEXT: pmovsxdq %xmm0, %xmm0
; SSE-NEXT: pxor %xmm5, %xmm5
; SSE-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE-NEXT: pextrb $0, %xmm5, %eax
+; SSE-NEXT: movmskps %xmm5, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: je .LBB1_2
; SSE-NEXT: # %bb.1: # %cond.load
-; SSE-NEXT: movq %xmm4, %rax
-; SSE-NEXT: movss {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE-NEXT: blendps {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3]
+; SSE-NEXT: movq %xmm4, %rcx
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE-NEXT: .LBB1_2: # %else
; SSE-NEXT: psllq $2, %xmm0
-; SSE-NEXT: pextrb $4, %xmm5, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $2, %al
; SSE-NEXT: je .LBB1_4
; SSE-NEXT: # %bb.3: # %cond.load1
-; SSE-NEXT: pextrq $1, %xmm4, %rax
+; SSE-NEXT: pextrq $1, %xmm4, %rcx
; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; SSE-NEXT: .LBB1_4: # %else2
; SSE-NEXT: paddq %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE-NEXT: pextrb $8, %xmm1, %eax
-; SSE-NEXT: testb $1, %al
-; SSE-NEXT: je .LBB1_6
-; SSE-NEXT: # %bb.5: # %cond.load4
-; SSE-NEXT: movq %xmm3, %rax
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: jne .LBB1_5
+; SSE-NEXT: # %bb.6: # %else5
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: jne .LBB1_7
+; SSE-NEXT: .LBB1_8: # %else8
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB1_5: # %cond.load4
+; SSE-NEXT: movq %xmm3, %rcx
; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; SSE-NEXT: .LBB1_6: # %else5
-; SSE-NEXT: pextrb $12, %xmm1, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $8, %al
; SSE-NEXT: je .LBB1_8
-; SSE-NEXT: # %bb.7: # %cond.load7
+; SSE-NEXT: .LBB1_7: # %cond.load7
; SSE-NEXT: pextrq $1, %xmm3, %rax
; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; SSE-NEXT: .LBB1_8: # %else8
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -205,39 +207,39 @@
; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpextrb $0, %xmm3, %eax
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB1_2
; AVX1-NEXT: # %bb.1: # %cond.load
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB1_2: # %else
-; AVX1-NEXT: vpextrb $4, %xmm3, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB1_4
; AVX1-NEXT: # %bb.3: # %cond.load1
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: .LBB1_4: # %else2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: je .LBB1_6
-; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: jne .LBB1_5
+; AVX1-NEXT: # %bb.6: # %else5
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB1_7
+; AVX1-NEXT: .LBB1_8: # %else8
+; AVX1-NEXT: vmovaps %xmm2, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB1_5: # %cond.load4
+; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1-NEXT: .LBB1_6: # %else5
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB1_8
-; AVX1-NEXT: # %bb.7: # %cond.load7
+; AVX1-NEXT: .LBB1_7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX1-NEXT: .LBB1_8: # %else8
; AVX1-NEXT: vmovaps %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -250,39 +252,39 @@
; AVX2-NEXT: vpsllq $2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB1_2
; AVX2-NEXT: # %bb.1: # %cond.load
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB1_2: # %else
-; AVX2-NEXT: vpextrb $4, %xmm3, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB1_4
; AVX2-NEXT: # %bb.3: # %cond.load1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX2-NEXT: .LBB1_4: # %else2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: je .LBB1_6
-; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: jne .LBB1_5
+; AVX2-NEXT: # %bb.6: # %else5
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB1_7
+; AVX2-NEXT: .LBB1_8: # %else8
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB1_5: # %cond.load4
+; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX2-NEXT: .LBB1_6: # %else5
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB1_8
-; AVX2-NEXT: # %bb.7: # %cond.load7
+; AVX2-NEXT: .LBB1_7: # %cond.load7
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX2-NEXT: .LBB1_8: # %else8
; AVX2-NEXT: vmovaps %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -317,39 +319,38 @@
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: pxor %xmm5, %xmm5
; SSE-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE-NEXT: pextrb $0, %xmm5, %eax
+; SSE-NEXT: movmskps %xmm5, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: je .LBB2_2
; SSE-NEXT: # %bb.1: # %cond.load
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: movss {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE-NEXT: blendps {{.*#+}} xmm3 = xmm6[0],xmm3[1,2,3]
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7]
; SSE-NEXT: .LBB2_2: # %else
; SSE-NEXT: psllq $2, %xmm1
-; SSE-NEXT: pextrb $4, %xmm5, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $2, %al
; SSE-NEXT: je .LBB2_4
; SSE-NEXT: # %bb.3: # %cond.load1
-; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; SSE-NEXT: .LBB2_4: # %else2
; SSE-NEXT: paddq %xmm1, %xmm4
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pextrb $8, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
-; SSE-NEXT: je .LBB2_6
-; SSE-NEXT: # %bb.5: # %cond.load4
-; SSE-NEXT: movq %xmm4, %rax
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: jne .LBB2_5
+; SSE-NEXT: # %bb.6: # %else5
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: jne .LBB2_7
+; SSE-NEXT: .LBB2_8: # %else8
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB2_5: # %cond.load4
+; SSE-NEXT: movq %xmm4, %rcx
; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
-; SSE-NEXT: .LBB2_6: # %else5
-; SSE-NEXT: pextrb $12, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $8, %al
; SSE-NEXT: je .LBB2_8
-; SSE-NEXT: # %bb.7: # %cond.load7
+; SSE-NEXT: .LBB2_7: # %cond.load7
; SSE-NEXT: pextrq $1, %xmm4, %rax
; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; SSE-NEXT: .LBB2_8: # %else8
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: retq
;
@@ -364,39 +365,39 @@
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpextrb $0, %xmm3, %eax
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB2_2
; AVX1-NEXT: # %bb.1: # %cond.load
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB2_2: # %else
-; AVX1-NEXT: vpextrb $4, %xmm3, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB2_4
; AVX1-NEXT: # %bb.3: # %cond.load1
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: .LBB2_4: # %else2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: je .LBB2_6
-; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: jne .LBB2_5
+; AVX1-NEXT: # %bb.6: # %else5
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB2_7
+; AVX1-NEXT: .LBB2_8: # %else8
+; AVX1-NEXT: vmovaps %xmm2, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB2_5: # %cond.load4
+; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1-NEXT: .LBB2_6: # %else5
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB2_8
-; AVX1-NEXT: # %bb.7: # %cond.load7
+; AVX1-NEXT: .LBB2_7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX1-NEXT: .LBB2_8: # %else8
; AVX1-NEXT: vmovaps %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -408,39 +409,39 @@
; AVX2-NEXT: vpsllq $2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB2_2
; AVX2-NEXT: # %bb.1: # %cond.load
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB2_2: # %else
-; AVX2-NEXT: vpextrb $4, %xmm3, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB2_4
; AVX2-NEXT: # %bb.3: # %cond.load1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX2-NEXT: .LBB2_4: # %else2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: je .LBB2_6
-; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: jne .LBB2_5
+; AVX2-NEXT: # %bb.6: # %else5
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB2_7
+; AVX2-NEXT: .LBB2_8: # %else8
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB2_5: # %cond.load4
+; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX2-NEXT: .LBB2_6: # %else5
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB2_8
-; AVX2-NEXT: # %bb.7: # %cond.load7
+; AVX2-NEXT: .LBB2_7: # %cond.load7
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX2-NEXT: .LBB2_8: # %else8
; AVX2-NEXT: vmovaps %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -480,149 +481,124 @@
; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: pxor %xmm6, %xmm6
; SSE-NEXT: pcmpeqb %xmm4, %xmm6
-; SSE-NEXT: pextrb $0, %xmm6, %eax
+; SSE-NEXT: pmovmskb %xmm6, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: je .LBB3_2
; SSE-NEXT: # %bb.1: # %cond.load
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: pinsrb $0, (%rax), %xmm5
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pinsrb $0, (%rcx), %xmm5
; SSE-NEXT: .LBB3_2: # %else
-; SSE-NEXT: pmovsxdq %xmm7, %xmm7
-; SSE-NEXT: pextrb $1, %xmm6, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: pmovsxdq %xmm7, %xmm4
+; SSE-NEXT: testb $2, %al
; SSE-NEXT: je .LBB3_4
; SSE-NEXT: # %bb.3: # %cond.load1
-; SSE-NEXT: pextrq $1, %xmm0, %rax
-; SSE-NEXT: pinsrb $1, (%rax), %xmm5
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pinsrb $1, (%rcx), %xmm5
; SSE-NEXT: .LBB3_4: # %else2
-; SSE-NEXT: paddq %xmm8, %xmm7
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpeqb %xmm4, %xmm6
-; SSE-NEXT: pextrb $2, %xmm6, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: paddq %xmm8, %xmm4
+; SSE-NEXT: testb $4, %al
; SSE-NEXT: je .LBB3_6
; SSE-NEXT: # %bb.5: # %cond.load4
-; SSE-NEXT: movq %xmm7, %rax
-; SSE-NEXT: pinsrb $2, (%rax), %xmm5
+; SSE-NEXT: movq %xmm4, %rcx
+; SSE-NEXT: pinsrb $2, (%rcx), %xmm5
; SSE-NEXT: .LBB3_6: # %else5
; SSE-NEXT: pmovsxdq %xmm1, %xmm0
-; SSE-NEXT: pextrb $3, %xmm6, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $8, %al
; SSE-NEXT: je .LBB3_8
; SSE-NEXT: # %bb.7: # %cond.load7
-; SSE-NEXT: pextrq $1, %xmm7, %rax
-; SSE-NEXT: pinsrb $3, (%rax), %xmm5
+; SSE-NEXT: pextrq $1, %xmm4, %rcx
+; SSE-NEXT: pinsrb $3, (%rcx), %xmm5
; SSE-NEXT: .LBB3_8: # %else8
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: paddq %xmm8, %xmm0
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpeqb %xmm4, %xmm6
-; SSE-NEXT: pextrb $4, %xmm6, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $16, %al
; SSE-NEXT: je .LBB3_10
; SSE-NEXT: # %bb.9: # %cond.load10
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: pinsrb $4, (%rax), %xmm5
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pinsrb $4, (%rcx), %xmm5
; SSE-NEXT: .LBB3_10: # %else11
; SSE-NEXT: pmovsxdq %xmm1, %xmm1
-; SSE-NEXT: pextrb $5, %xmm6, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $32, %al
; SSE-NEXT: je .LBB3_12
; SSE-NEXT: # %bb.11: # %cond.load13
-; SSE-NEXT: pextrq $1, %xmm0, %rax
-; SSE-NEXT: pinsrb $5, (%rax), %xmm5
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pinsrb $5, (%rcx), %xmm5
; SSE-NEXT: .LBB3_12: # %else14
; SSE-NEXT: paddq %xmm8, %xmm1
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpeqb %xmm4, %xmm6
-; SSE-NEXT: pextrb $6, %xmm6, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $64, %al
; SSE-NEXT: je .LBB3_14
; SSE-NEXT: # %bb.13: # %cond.load16
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: pinsrb $6, (%rax), %xmm5
+; SSE-NEXT: movq %xmm1, %rcx
+; SSE-NEXT: pinsrb $6, (%rcx), %xmm5
; SSE-NEXT: .LBB3_14: # %else17
; SSE-NEXT: pmovsxdq %xmm2, %xmm0
-; SSE-NEXT: pextrb $7, %xmm6, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testb $-128, %al
; SSE-NEXT: je .LBB3_16
; SSE-NEXT: # %bb.15: # %cond.load19
-; SSE-NEXT: pextrq $1, %xmm1, %rax
-; SSE-NEXT: pinsrb $7, (%rax), %xmm5
+; SSE-NEXT: pextrq $1, %xmm1, %rcx
+; SSE-NEXT: pinsrb $7, (%rcx), %xmm5
; SSE-NEXT: .LBB3_16: # %else20
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: paddq %xmm8, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE-NEXT: pextrb $8, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testl $256, %eax # imm = 0x100
; SSE-NEXT: je .LBB3_18
; SSE-NEXT: # %bb.17: # %cond.load22
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: pinsrb $8, (%rax), %xmm5
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pinsrb $8, (%rcx), %xmm5
; SSE-NEXT: .LBB3_18: # %else23
; SSE-NEXT: pmovsxdq %xmm1, %xmm1
-; SSE-NEXT: pextrb $9, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testl $512, %eax # imm = 0x200
; SSE-NEXT: je .LBB3_20
; SSE-NEXT: # %bb.19: # %cond.load25
-; SSE-NEXT: pextrq $1, %xmm0, %rax
-; SSE-NEXT: pinsrb $9, (%rax), %xmm5
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pinsrb $9, (%rcx), %xmm5
; SSE-NEXT: .LBB3_20: # %else26
; SSE-NEXT: paddq %xmm8, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE-NEXT: pextrb $10, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testl $1024, %eax # imm = 0x400
; SSE-NEXT: je .LBB3_22
; SSE-NEXT: # %bb.21: # %cond.load28
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: pinsrb $10, (%rax), %xmm5
+; SSE-NEXT: movq %xmm1, %rcx
+; SSE-NEXT: pinsrb $10, (%rcx), %xmm5
; SSE-NEXT: .LBB3_22: # %else29
; SSE-NEXT: pmovsxdq %xmm3, %xmm0
-; SSE-NEXT: pextrb $11, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testl $2048, %eax # imm = 0x800
; SSE-NEXT: je .LBB3_24
; SSE-NEXT: # %bb.23: # %cond.load31
-; SSE-NEXT: pextrq $1, %xmm1, %rax
-; SSE-NEXT: pinsrb $11, (%rax), %xmm5
+; SSE-NEXT: pextrq $1, %xmm1, %rcx
+; SSE-NEXT: pinsrb $11, (%rcx), %xmm5
; SSE-NEXT: .LBB3_24: # %else32
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; SSE-NEXT: paddq %xmm8, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE-NEXT: pextrb $12, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testl $4096, %eax # imm = 0x1000
; SSE-NEXT: je .LBB3_26
; SSE-NEXT: # %bb.25: # %cond.load34
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: pinsrb $12, (%rax), %xmm5
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pinsrb $12, (%rcx), %xmm5
; SSE-NEXT: .LBB3_26: # %else35
; SSE-NEXT: pmovsxdq %xmm1, %xmm1
-; SSE-NEXT: pextrb $13, %xmm2, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testl $8192, %eax # imm = 0x2000
; SSE-NEXT: je .LBB3_28
; SSE-NEXT: # %bb.27: # %cond.load37
-; SSE-NEXT: pextrq $1, %xmm0, %rax
-; SSE-NEXT: pinsrb $13, (%rax), %xmm5
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pinsrb $13, (%rcx), %xmm5
; SSE-NEXT: .LBB3_28: # %else38
; SSE-NEXT: paddq %xmm1, %xmm8
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE-NEXT: pextrb $14, %xmm4, %eax
-; SSE-NEXT: testb $1, %al
-; SSE-NEXT: je .LBB3_30
-; SSE-NEXT: # %bb.29: # %cond.load40
-; SSE-NEXT: movq %xmm8, %rax
-; SSE-NEXT: pinsrb $14, (%rax), %xmm5
-; SSE-NEXT: .LBB3_30: # %else41
-; SSE-NEXT: pextrb $15, %xmm4, %eax
-; SSE-NEXT: testb $1, %al
+; SSE-NEXT: testl $16384, %eax # imm = 0x4000
+; SSE-NEXT: jne .LBB3_29
+; SSE-NEXT: # %bb.30: # %else41
+; SSE-NEXT: testl $32768, %eax # imm = 0x8000
+; SSE-NEXT: jne .LBB3_31
+; SSE-NEXT: .LBB3_32: # %else44
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB3_29: # %cond.load40
+; SSE-NEXT: movq %xmm8, %rcx
+; SSE-NEXT: pinsrb $14, (%rcx), %xmm5
+; SSE-NEXT: testl $32768, %eax # imm = 0x8000
; SSE-NEXT: je .LBB3_32
-; SSE-NEXT: # %bb.31: # %cond.load43
+; SSE-NEXT: .LBB3_31: # %cond.load43
; SSE-NEXT: pextrq $1, %xmm8, %rax
; SSE-NEXT: pinsrb $15, (%rax), %xmm5
-; SSE-NEXT: .LBB3_32: # %else44
; SSE-NEXT: movdqa %xmm5, %xmm0
; SSE-NEXT: retq
;
@@ -638,156 +614,132 @@
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
-; AVX1-NEXT: vpextrb $0, %xmm6, %eax
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpmovmskb %xmm2, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB3_2
; AVX1-NEXT: # %bb.1: # %cond.load
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $0, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $0, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_2: # %else
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
-; AVX1-NEXT: vpextrb $1, %xmm6, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB3_4
; AVX1-NEXT: # %bb.3: # %cond.load1
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpinsrb $1, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $1, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_4: # %else2
-; AVX1-NEXT: vpmovsxdq %xmm7, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm5
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
-; AVX1-NEXT: vpextrb $2, %xmm7, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm5
+; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm2
+; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB3_6
; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $2, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_6: # %else5
-; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpextrb $3, %xmm7, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB3_8
; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $3, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_8: # %else8
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpextrb $4, %xmm5, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm0
+; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB3_10
; AVX1-NEXT: # %bb.9: # %cond.load10
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $4, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $4, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_10: # %else11
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm7
-; AVX1-NEXT: vpextrb $5, %xmm5, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB3_12
; AVX1-NEXT: # %bb.11: # %cond.load13
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpinsrb $5, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $5, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_12: # %else14
-; AVX1-NEXT: vpmovsxdq %xmm6, %xmm6
-; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
-; AVX1-NEXT: vpextrb $6, %xmm7, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm5
+; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm2
+; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB3_14
; AVX1-NEXT: # %bb.13: # %cond.load16
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $6, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_14: # %else17
-; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpextrb $7, %xmm7, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB3_16
; AVX1-NEXT: # %bb.15: # %cond.load19
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $7, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_16: # %else20
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpextrb $8, %xmm5, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm0
+; AVX1-NEXT: testl $256, %eax # imm = 0x100
; AVX1-NEXT: je .LBB3_18
; AVX1-NEXT: # %bb.17: # %cond.load22
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $8, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $8, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_18: # %else23
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $9, %xmm5, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $512, %eax # imm = 0x200
; AVX1-NEXT: je .LBB3_20
; AVX1-NEXT: # %bb.19: # %cond.load25
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpinsrb $9, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $9, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_20: # %else26
-; AVX1-NEXT: vpmovsxdq %xmm6, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
-; AVX1-NEXT: vpextrb $10, %xmm6, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $1024, %eax # imm = 0x400
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB3_22
; AVX1-NEXT: # %bb.21: # %cond.load28
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $10, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_22: # %else29
-; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $11, %xmm6, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: testl $2048, %eax # imm = 0x800
; AVX1-NEXT: je .LBB3_24
; AVX1-NEXT: # %bb.23: # %cond.load31
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $11, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_24: # %else32
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm0
+; AVX1-NEXT: testl $4096, %eax # imm = 0x1000
; AVX1-NEXT: je .LBB3_26
; AVX1-NEXT: # %bb.25: # %cond.load34
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $12, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $12, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_26: # %else35
-; AVX1-NEXT: vpextrb $13, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $8192, %eax # imm = 0x2000
; AVX1-NEXT: je .LBB3_28
; AVX1-NEXT: # %bb.27: # %cond.load37
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpinsrb $13, (%rax), %xmm3, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $13, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_28: # %else38
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $16384, %eax # imm = 0x4000
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: je .LBB3_30
-; AVX1-NEXT: # %bb.29: # %cond.load40
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
-; AVX1-NEXT: .LBB3_30: # %else41
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB3_29
+; AVX1-NEXT: # %bb.30: # %else41
+; AVX1-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX1-NEXT: jne .LBB3_31
+; AVX1-NEXT: .LBB3_32: # %else44
+; AVX1-NEXT: vmovdqa %xmm3, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB3_29: # %cond.load40
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vpinsrb $14, (%rcx), %xmm3, %xmm3
+; AVX1-NEXT: testl $32768, %eax # imm = 0x8000
; AVX1-NEXT: je .LBB3_32
-; AVX1-NEXT: # %bb.31: # %cond.load43
+; AVX1-NEXT: .LBB3_31: # %cond.load43
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
-; AVX1-NEXT: .LBB3_32: # %else44
; AVX1-NEXT: vmovdqa %xmm3, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -799,145 +751,121 @@
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm5
; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm5
; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
-; AVX2-NEXT: vpextrb $0, %xmm6, %eax
+; AVX2-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpmovmskb %xmm2, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB3_2
; AVX2-NEXT: # %bb.1: # %cond.load
-; AVX2-NEXT: vmovq %xmm5, %rax
-; AVX2-NEXT: vpinsrb $0, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm5, %rcx
+; AVX2-NEXT: vpinsrb $0, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_2: # %else
-; AVX2-NEXT: vpextrb $1, %xmm6, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB3_4
; AVX2-NEXT: # %bb.3: # %cond.load1
-; AVX2-NEXT: vpextrq $1, %xmm5, %rax
-; AVX2-NEXT: vpinsrb $1, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm5, %rcx
+; AVX2-NEXT: vpinsrb $1, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_4: # %else2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
-; AVX2-NEXT: vpextrb $2, %xmm6, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm0
; AVX2-NEXT: je .LBB3_6
; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vmovq %xmm5, %rax
-; AVX2-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $2, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_6: # %else5
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vpextrb $3, %xmm6, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB3_8
; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vpextrq $1, %xmm5, %rax
-; AVX2-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $3, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_8: # %else8
-; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
-; AVX2-NEXT: vpextrb $4, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm0
+; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB3_10
; AVX2-NEXT: # %bb.9: # %cond.load10
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpinsrb $4, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $4, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_10: # %else11
-; AVX2-NEXT: vpextrb $5, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB3_12
; AVX2-NEXT: # %bb.11: # %cond.load13
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpinsrb $5, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $5, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_12: # %else14
-; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
-; AVX2-NEXT: vpextrb $6, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB3_14
; AVX2-NEXT: # %bb.13: # %cond.load16
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $6, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_14: # %else17
-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm6
-; AVX2-NEXT: vpextrb $7, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2
+; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB3_16
; AVX2-NEXT: # %bb.15: # %cond.load19
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $7, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_16: # %else20
-; AVX2-NEXT: vpaddq %ymm6, %ymm4, %ymm0
-; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
-; AVX2-NEXT: vpextrb $8, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm0
+; AVX2-NEXT: testl $256, %eax # imm = 0x100
; AVX2-NEXT: je .LBB3_18
; AVX2-NEXT: # %bb.17: # %cond.load22
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpinsrb $8, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $8, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_18: # %else23
-; AVX2-NEXT: vpextrb $9, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $512, %eax # imm = 0x200
; AVX2-NEXT: je .LBB3_20
; AVX2-NEXT: # %bb.19: # %cond.load25
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpinsrb $9, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $9, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_20: # %else26
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
-; AVX2-NEXT: vpextrb $10, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $1024, %eax # imm = 0x400
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB3_22
; AVX2-NEXT: # %bb.21: # %cond.load28
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $10, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_22: # %else29
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT: vpextrb $11, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $2048, %eax # imm = 0x800
; AVX2-NEXT: je .LBB3_24
; AVX2-NEXT: # %bb.23: # %cond.load31
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $11, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_24: # %else32
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $4096, %eax # imm = 0x1000
; AVX2-NEXT: je .LBB3_26
; AVX2-NEXT: # %bb.25: # %cond.load34
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpinsrb $12, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $12, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_26: # %else35
-; AVX2-NEXT: vpextrb $13, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $8192, %eax # imm = 0x2000
; AVX2-NEXT: je .LBB3_28
; AVX2-NEXT: # %bb.27: # %cond.load37
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpinsrb $13, (%rax), %xmm3, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $13, (%rcx), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_28: # %else38
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $16384, %eax # imm = 0x4000
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: je .LBB3_30
-; AVX2-NEXT: # %bb.29: # %cond.load40
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
-; AVX2-NEXT: .LBB3_30: # %else41
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB3_29
+; AVX2-NEXT: # %bb.30: # %else41
+; AVX2-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX2-NEXT: jne .LBB3_31
+; AVX2-NEXT: .LBB3_32: # %else44
+; AVX2-NEXT: vmovdqa %xmm3, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB3_29: # %cond.load40
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vpinsrb $14, (%rcx), %xmm3, %xmm3
+; AVX2-NEXT: testl $32768, %eax # imm = 0x8000
; AVX2-NEXT: je .LBB3_32
-; AVX2-NEXT: # %bb.31: # %cond.load43
+; AVX2-NEXT: .LBB3_31: # %cond.load43
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
-; AVX2-NEXT: .LBB3_32: # %else44
; AVX2-NEXT: vmovdqa %xmm3, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -948,175 +876,134 @@
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm4
; AVX512-NEXT: vpaddq %zmm4, %zmm3, %zmm4
; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm5
-; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k0
-; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1
+; AVX512-NEXT: vpmovmskb %xmm1, %eax
; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: je .LBB3_2
-; AVX512-NEXT: # %bb.1: # %cond.load
-; AVX512-NEXT: vmovq %xmm4, %rax
-; AVX512-NEXT: vpinsrb $0, (%rax), %xmm2, %xmm2
-; AVX512-NEXT: .LBB3_2: # %else
-; AVX512-NEXT: kshiftrw $1, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: je .LBB3_4
-; AVX512-NEXT: # %bb.3: # %cond.load1
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vpinsrb $1, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: jne .LBB3_1
+; AVX512-NEXT: # %bb.2: # %else
+; AVX512-NEXT: testb $2, %al
+; AVX512-NEXT: jne .LBB3_3
; AVX512-NEXT: .LBB3_4: # %else2
-; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm5
-; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: je .LBB3_6
-; AVX512-NEXT: # %bb.5: # %cond.load4
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT: vmovq %xmm5, %rax
-; AVX512-NEXT: vpinsrb $2, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: testb $4, %al
+; AVX512-NEXT: jne .LBB3_5
; AVX512-NEXT: .LBB3_6: # %else5
-; AVX512-NEXT: kshiftrw $3, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: testb $8, %al
; AVX512-NEXT: je .LBB3_8
-; AVX512-NEXT: # %bb.7: # %cond.load7
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rax
-; AVX512-NEXT: vpinsrb $3, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: .LBB3_7: # %cond.load7
+; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $3, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_8: # %else8
-; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm5
-; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
+; AVX512-NEXT: testb $16, %al
+; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm1
; AVX512-NEXT: je .LBB3_10
; AVX512-NEXT: # %bb.9: # %cond.load10
-; AVX512-NEXT: vmovq %xmm5, %rax
-; AVX512-NEXT: vpinsrb $4, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $4, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_10: # %else11
-; AVX512-NEXT: kshiftrw $5, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: testb $32, %al
; AVX512-NEXT: je .LBB3_12
; AVX512-NEXT: # %bb.11: # %cond.load13
-; AVX512-NEXT: vpextrq $1, %xmm5, %rax
-; AVX512-NEXT: vpinsrb $5, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $5, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_12: # %else14
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm5
-; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k0
-; AVX512-NEXT: kshiftrw $6, %k0, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm4
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: testb $64, %al
+; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm0
; AVX512-NEXT: je .LBB3_14
; AVX512-NEXT: # %bb.13: # %cond.load16
-; AVX512-NEXT: vmovq %xmm4, %rax
-; AVX512-NEXT: vpinsrb $6, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vpinsrb $6, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_14: # %else17
-; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: kshiftrw $7, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT: testb $-128, %al
; AVX512-NEXT: je .LBB3_16
; AVX512-NEXT: # %bb.15: # %cond.load19
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vpinsrb $7, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vpinsrb $7, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_16: # %else20
-; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
-; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm3
-; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512-NEXT: kshiftrw $8, %k0, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: je .LBB3_18
-; AVX512-NEXT: # %bb.17: # %cond.load22
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vpinsrb $8, (%rax), %xmm2, %xmm2
-; AVX512-NEXT: .LBB3_18: # %else23
-; AVX512-NEXT: kshiftrw $9, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: je .LBB3_20
-; AVX512-NEXT: # %bb.19: # %cond.load25
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512-NEXT: vpinsrb $9, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: vpaddq %zmm1, %zmm3, %zmm0
+; AVX512-NEXT: testl $256, %eax # imm = 0x100
+; AVX512-NEXT: jne .LBB3_17
+; AVX512-NEXT: # %bb.18: # %else23
+; AVX512-NEXT: testl $512, %eax # imm = 0x200
+; AVX512-NEXT: jne .LBB3_19
; AVX512-NEXT: .LBB3_20: # %else26
-; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm3
-; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512-NEXT: kshiftrw $10, %k0, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: je .LBB3_22
-; AVX512-NEXT: # %bb.21: # %cond.load28
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: vpinsrb $10, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: testl $1024, %eax # imm = 0x400
+; AVX512-NEXT: jne .LBB3_21
; AVX512-NEXT: .LBB3_22: # %else29
-; AVX512-NEXT: kshiftrw $11, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: testl $2048, %eax # imm = 0x800
; AVX512-NEXT: je .LBB3_24
-; AVX512-NEXT: # %bb.23: # %cond.load31
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: vpinsrb $11, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: .LBB3_23: # %cond.load31
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $11, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_24: # %else32
-; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm3
-; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: testb $1, %al
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; AVX512-NEXT: testl $4096, %eax # imm = 0x1000
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512-NEXT: je .LBB3_26
; AVX512-NEXT: # %bb.25: # %cond.load34
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: vpinsrb $12, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $12, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_26: # %else35
-; AVX512-NEXT: kshiftrw $13, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: testl $8192, %eax # imm = 0x2000
; AVX512-NEXT: je .LBB3_28
; AVX512-NEXT: # %bb.27: # %cond.load37
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: vpinsrb $13, (%rax), %xmm2, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $13, (%rcx), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_28: # %else38
-; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: testl $16384, %eax # imm = 0x4000
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512-NEXT: je .LBB3_30
-; AVX512-NEXT: # %bb.29: # %cond.load40
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vpinsrb $14, (%rax), %xmm2, %xmm2
-; AVX512-NEXT: .LBB3_30: # %else41
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: jne .LBB3_29
+; AVX512-NEXT: # %bb.30: # %else41
+; AVX512-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX512-NEXT: jne .LBB3_31
+; AVX512-NEXT: .LBB3_32: # %else44
+; AVX512-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+; AVX512-NEXT: .LBB3_1: # %cond.load
+; AVX512-NEXT: vmovq %xmm4, %rcx
+; AVX512-NEXT: vpinsrb $0, (%rcx), %xmm2, %xmm2
+; AVX512-NEXT: testb $2, %al
+; AVX512-NEXT: je .LBB3_4
+; AVX512-NEXT: .LBB3_3: # %cond.load1
+; AVX512-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512-NEXT: vpinsrb $1, (%rcx), %xmm2, %xmm2
+; AVX512-NEXT: testb $4, %al
+; AVX512-NEXT: je .LBB3_6
+; AVX512-NEXT: .LBB3_5: # %cond.load4
+; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $2, (%rcx), %xmm2, %xmm2
+; AVX512-NEXT: testb $8, %al
+; AVX512-NEXT: jne .LBB3_7
+; AVX512-NEXT: jmp .LBB3_8
+; AVX512-NEXT: .LBB3_17: # %cond.load22
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vpinsrb $8, (%rcx), %xmm2, %xmm2
+; AVX512-NEXT: testl $512, %eax # imm = 0x200
+; AVX512-NEXT: je .LBB3_20
+; AVX512-NEXT: .LBB3_19: # %cond.load25
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vpinsrb $9, (%rcx), %xmm2, %xmm2
+; AVX512-NEXT: testl $1024, %eax # imm = 0x400
+; AVX512-NEXT: je .LBB3_22
+; AVX512-NEXT: .LBB3_21: # %cond.load28
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: vpinsrb $10, (%rcx), %xmm2, %xmm2
+; AVX512-NEXT: testl $2048, %eax # imm = 0x800
+; AVX512-NEXT: jne .LBB3_23
+; AVX512-NEXT: jmp .LBB3_24
+; AVX512-NEXT: .LBB3_29: # %cond.load40
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vpinsrb $14, (%rcx), %xmm2, %xmm2
+; AVX512-NEXT: testl $32768, %eax # imm = 0x8000
; AVX512-NEXT: je .LBB3_32
-; AVX512-NEXT: # %bb.31: # %cond.load43
+; AVX512-NEXT: .LBB3_31: # %cond.load43
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vpinsrb $15, (%rax), %xmm2, %xmm2
-; AVX512-NEXT: .LBB3_32: # %else44
; AVX512-NEXT: vmovdqa %xmm2, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
Index: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
@@ -69,8 +69,9 @@
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %res.phi.else = phi
-; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i64 1
-; SCALAR-NEXT: br i1 %Mask1, label %cond.load1, label %else2
+; SCALAR-NEXT: and i16 %{{.*}}, 2
+; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
+; SCALAR-NEXT: br i1 %{{.*}}, label %cond.load1, label %else2
define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
@@ -211,16 +212,18 @@
; SCALAR-LABEL: test5
-; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i64 0
-; SCALAR-NEXT: br i1 %Mask0, label %cond.store, label %else
+; SCALAR: and i16 %scalar_mask, 1
+; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
+; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i64 0
; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i64 0
; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT: br label %else
; SCALAR: else:
-; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i64 1
-; SCALAR-NEXT: br i1 %Mask1, label %cond.store1, label %else2
+; SCALAR-NEXT: and i16 %scalar_mask, 2
+; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
+; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store1, label %else2
define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
@@ -1660,33 +1663,47 @@
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL_64-NEXT: kshiftrw $1, %k0, %k1
+; KNL_64-NEXT: kshiftrw $2, %k0, %k2
; KNL_64-NEXT: kmovw %k0, %eax
+; KNL_64-NEXT: andb $1, %al
+; KNL_64-NEXT: kmovw %k1, %ecx
+; KNL_64-NEXT: andb $1, %cl
+; KNL_64-NEXT: addb %cl, %cl
+; KNL_64-NEXT: orb %al, %cl
+; KNL_64-NEXT: kmovw %k2, %eax
+; KNL_64-NEXT: andb $1, %al
+; KNL_64-NEXT: shlb $2, %al
+; KNL_64-NEXT: orb %cl, %al
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; KNL_64-NEXT: testb $1, %al
-; KNL_64-NEXT: je .LBB31_2
-; KNL_64-NEXT: # %bb.1: # %cond.load
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3
-; KNL_64-NEXT: .LBB31_2: # %else
-; KNL_64-NEXT: kshiftrw $1, %k0, %k1
-; KNL_64-NEXT: kmovw %k1, %eax
-; KNL_64-NEXT: testb $1, %al
-; KNL_64-NEXT: je .LBB31_4
-; KNL_64-NEXT: # %bb.3: # %cond.load1
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
+; KNL_64-NEXT: jne .LBB31_1
+; KNL_64-NEXT: # %bb.2: # %else
+; KNL_64-NEXT: testb $2, %al
+; KNL_64-NEXT: jne .LBB31_3
; KNL_64-NEXT: .LBB31_4: # %else2
-; KNL_64-NEXT: kshiftrw $2, %k0, %k0
-; KNL_64-NEXT: kmovw %k0, %eax
-; KNL_64-NEXT: testb $1, %al
+; KNL_64-NEXT: testb $4, %al
+; KNL_64-NEXT: jne .LBB31_5
+; KNL_64-NEXT: .LBB31_6: # %else5
+; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
+; KNL_64-NEXT: vzeroupper
+; KNL_64-NEXT: retq
+; KNL_64-NEXT: .LBB31_1: # %cond.load
+; KNL_64-NEXT: vmovq %xmm0, %rcx
+; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
+; KNL_64-NEXT: testb $2, %al
+; KNL_64-NEXT: je .LBB31_4
+; KNL_64-NEXT: .LBB31_3: # %cond.load1
+; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
+; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
+; KNL_64-NEXT: testb $4, %al
; KNL_64-NEXT: je .LBB31_6
-; KNL_64-NEXT: # %bb.5: # %cond.load4
+; KNL_64-NEXT: .LBB31_5: # %cond.load4
; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT: vmovq %xmm0, %rax
; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
-; KNL_64-NEXT: .LBB31_6: # %else5
; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
@@ -1698,32 +1715,48 @@
; KNL_32-NEXT: vmovdqa %xmm0, %xmm3
; KNL_32-NEXT: vpslld $31, %xmm2, %xmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL_32-NEXT: kshiftrw $1, %k0, %k1
+; KNL_32-NEXT: kshiftrw $2, %k0, %k2
; KNL_32-NEXT: kmovw %k0, %eax
+; KNL_32-NEXT: andb $1, %al
+; KNL_32-NEXT: kmovw %k1, %ecx
+; KNL_32-NEXT: andb $1, %cl
+; KNL_32-NEXT: addb %cl, %cl
+; KNL_32-NEXT: orb %al, %cl
+; KNL_32-NEXT: kmovw %k2, %eax
+; KNL_32-NEXT: andb $1, %al
+; KNL_32-NEXT: shlb $2, %al
+; KNL_32-NEXT: orb %cl, %al
; KNL_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; KNL_32-NEXT: testb $1, %al
-; KNL_32-NEXT: je .LBB31_2
-; KNL_32-NEXT: # %bb.1: # %cond.load
-; KNL_32-NEXT: vmovd %xmm1, %eax
-; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
-; KNL_32-NEXT: .LBB31_2: # %else
-; KNL_32-NEXT: kshiftrw $1, %k0, %k1
-; KNL_32-NEXT: kmovw %k1, %eax
-; KNL_32-NEXT: testb $1, %al
-; KNL_32-NEXT: je .LBB31_4
-; KNL_32-NEXT: # %bb.3: # %cond.load1
-; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
-; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
+; KNL_32-NEXT: jne .LBB31_1
+; KNL_32-NEXT: # %bb.2: # %else
+; KNL_32-NEXT: testb $2, %al
+; KNL_32-NEXT: jne .LBB31_3
; KNL_32-NEXT: .LBB31_4: # %else2
-; KNL_32-NEXT: kshiftrw $2, %k0, %k0
-; KNL_32-NEXT: kmovw %k0, %eax
-; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: testb $4, %al
+; KNL_32-NEXT: jne .LBB31_5
+; KNL_32-NEXT: .LBB31_6: # %else5
+; KNL_32-NEXT: addl $12, %esp
+; KNL_32-NEXT: .cfi_def_cfa_offset 4
+; KNL_32-NEXT: vzeroupper
+; KNL_32-NEXT: retl
+; KNL_32-NEXT: .LBB31_1: # %cond.load
+; KNL_32-NEXT: .cfi_def_cfa_offset 16
+; KNL_32-NEXT: vmovd %xmm1, %ecx
+; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: testb $2, %al
+; KNL_32-NEXT: je .LBB31_4
+; KNL_32-NEXT: .LBB31_3: # %cond.load1
+; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
+; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: je .LBB31_6
-; KNL_32-NEXT: # %bb.5: # %cond.load4
+; KNL_32-NEXT: .LBB31_5: # %cond.load4
; KNL_32-NEXT: vpextrd $2, %xmm1, %eax
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
-; KNL_32-NEXT: .LBB31_6: # %else5
; KNL_32-NEXT: addl $12, %esp
; KNL_32-NEXT: .cfi_def_cfa_offset 4
; KNL_32-NEXT: vzeroupper
@@ -1733,33 +1766,47 @@
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vpmovd2m %xmm2, %k0
+; SKX-NEXT: kshiftrb $1, %k0, %k1
+; SKX-NEXT: kshiftrb $2, %k0, %k2
; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: kmovw %k1, %ecx
+; SKX-NEXT: andb $1, %cl
+; SKX-NEXT: addb %cl, %cl
+; SKX-NEXT: orb %al, %cl
+; SKX-NEXT: kmovw %k2, %eax
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: shlb $2, %al
+; SKX-NEXT: orb %cl, %al
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX-NEXT: testb $1, %al
-; SKX-NEXT: je .LBB31_2
-; SKX-NEXT: # %bb.1: # %cond.load
-; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3
-; SKX-NEXT: .LBB31_2: # %else
-; SKX-NEXT: kshiftrb $1, %k0, %k1
-; SKX-NEXT: kmovw %k1, %eax
-; SKX-NEXT: testb $1, %al
-; SKX-NEXT: je .LBB31_4
-; SKX-NEXT: # %bb.3: # %cond.load1
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
+; SKX-NEXT: jne .LBB31_1
+; SKX-NEXT: # %bb.2: # %else
+; SKX-NEXT: testb $2, %al
+; SKX-NEXT: jne .LBB31_3
; SKX-NEXT: .LBB31_4: # %else2
-; SKX-NEXT: kshiftrb $2, %k0, %k0
-; SKX-NEXT: kmovw %k0, %eax
-; SKX-NEXT: testb $1, %al
+; SKX-NEXT: testb $4, %al
+; SKX-NEXT: jne .LBB31_5
+; SKX-NEXT: .LBB31_6: # %else5
+; SKX-NEXT: vmovdqa %xmm3, %xmm0
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+; SKX-NEXT: .LBB31_1: # %cond.load
+; SKX-NEXT: vmovq %xmm0, %rcx
+; SKX-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
+; SKX-NEXT: testb $2, %al
+; SKX-NEXT: je .LBB31_4
+; SKX-NEXT: .LBB31_3: # %cond.load1
+; SKX-NEXT: vpextrq $1, %xmm0, %rcx
+; SKX-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
+; SKX-NEXT: testb $4, %al
; SKX-NEXT: je .LBB31_6
-; SKX-NEXT: # %bb.5: # %cond.load4
+; SKX-NEXT: .LBB31_5: # %cond.load4
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vmovq %xmm0, %rax
; SKX-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
-; SKX-NEXT: .LBB31_6: # %else5
; SKX-NEXT: vmovdqa %xmm3, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -1771,32 +1818,47 @@
; SKX_32-NEXT: vmovdqa %xmm0, %xmm3
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm0
; SKX_32-NEXT: vpmovd2m %xmm0, %k0
+; SKX_32-NEXT: kshiftrb $1, %k0, %k1
+; SKX_32-NEXT: kshiftrb $2, %k0, %k2
; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: kmovw %k1, %ecx
+; SKX_32-NEXT: andb $1, %cl
+; SKX_32-NEXT: addb %cl, %cl
+; SKX_32-NEXT: orb %al, %cl
+; SKX_32-NEXT: kmovw %k2, %eax
+; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: shlb $2, %al
+; SKX_32-NEXT: orb %cl, %al
; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; SKX_32-NEXT: testb $1, %al
-; SKX_32-NEXT: je .LBB31_2
-; SKX_32-NEXT: # %bb.1: # %cond.load
-; SKX_32-NEXT: vmovd %xmm1, %eax
-; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
-; SKX_32-NEXT: .LBB31_2: # %else
-; SKX_32-NEXT: kshiftrb $1, %k0, %k1
-; SKX_32-NEXT: kmovw %k1, %eax
-; SKX_32-NEXT: testb $1, %al
-; SKX_32-NEXT: je .LBB31_4
-; SKX_32-NEXT: # %bb.3: # %cond.load1
-; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
-; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
+; SKX_32-NEXT: jne .LBB31_1
+; SKX_32-NEXT: # %bb.2: # %else
+; SKX_32-NEXT: testb $2, %al
+; SKX_32-NEXT: jne .LBB31_3
; SKX_32-NEXT: .LBB31_4: # %else2
-; SKX_32-NEXT: kshiftrb $2, %k0, %k0
-; SKX_32-NEXT: kmovw %k0, %eax
-; SKX_32-NEXT: testb $1, %al
+; SKX_32-NEXT: testb $4, %al
+; SKX_32-NEXT: jne .LBB31_5
+; SKX_32-NEXT: .LBB31_6: # %else5
+; SKX_32-NEXT: addl $12, %esp
+; SKX_32-NEXT: .cfi_def_cfa_offset 4
+; SKX_32-NEXT: retl
+; SKX_32-NEXT: .LBB31_1: # %cond.load
+; SKX_32-NEXT: .cfi_def_cfa_offset 16
+; SKX_32-NEXT: vmovd %xmm1, %ecx
+; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
+; SKX_32-NEXT: testb $2, %al
+; SKX_32-NEXT: je .LBB31_4
+; SKX_32-NEXT: .LBB31_3: # %cond.load1
+; SKX_32-NEXT: vpextrd $1, %xmm1, %ecx
+; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
+; SKX_32-NEXT: testb $4, %al
; SKX_32-NEXT: je .LBB31_6
-; SKX_32-NEXT: # %bb.5: # %cond.load4
+; SKX_32-NEXT: .LBB31_5: # %cond.load4
; SKX_32-NEXT: vpextrd $2, %xmm1, %eax
; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
-; SKX_32-NEXT: .LBB31_6: # %else5
; SKX_32-NEXT: addl $12, %esp
; SKX_32-NEXT: .cfi_def_cfa_offset 4
; SKX_32-NEXT: retl
Index: llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -122,20 +122,23 @@
; WIDEN_AVX2-NEXT: vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; WIDEN_AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; WIDEN_AVX2-NEXT: testb $1, %al
-; WIDEN_AVX2-NEXT: je .LBB1_2
-; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store
-; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax
-; WIDEN_AVX2-NEXT: vmovlps %xmm0, (%rax)
-; WIDEN_AVX2-NEXT: .LBB1_2: # %else
-; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
+; WIDEN_AVX2-NEXT: vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
+; WIDEN_AVX2-NEXT: jne .LBB1_1
+; WIDEN_AVX2-NEXT: # %bb.2: # %else
+; WIDEN_AVX2-NEXT: testb $2, %al
+; WIDEN_AVX2-NEXT: jne .LBB1_3
+; WIDEN_AVX2-NEXT: .LBB1_4: # %else2
+; WIDEN_AVX2-NEXT: retq
+; WIDEN_AVX2-NEXT: .LBB1_1: # %cond.store
+; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx
+; WIDEN_AVX2-NEXT: vmovlps %xmm0, (%rcx)
+; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: je .LBB1_4
-; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1
+; WIDEN_AVX2-NEXT: .LBB1_3: # %cond.store1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vmovhps %xmm0, (%rax)
-; WIDEN_AVX2-NEXT: .LBB1_4: # %else2
; WIDEN_AVX2-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index:
@@ -147,20 +150,23 @@
; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; PROMOTE_AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; PROMOTE_AVX2-NEXT: testb $1, %al
-; PROMOTE_AVX2-NEXT: je .LBB1_2
-; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store
-; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: .LBB1_2: # %else
-; PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
+; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
+; PROMOTE_AVX2-NEXT: jne .LBB1_1
+; PROMOTE_AVX2-NEXT: # %bb.2: # %else
+; PROMOTE_AVX2-NEXT: testb $2, %al
+; PROMOTE_AVX2-NEXT: jne .LBB1_3
+; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2
+; PROMOTE_AVX2-NEXT: retq
+; PROMOTE_AVX2-NEXT: .LBB1_1: # %cond.store
+; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
+; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rcx)
+; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: je .LBB1_4
-; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1
+; PROMOTE_AVX2-NEXT: .LBB1_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; PROMOTE_AVX2-NEXT: vmovhps %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2
; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr double, double *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
@@ -273,38 +279,44 @@
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data:
; WIDEN_AVX2: # %bb.0:
-; WIDEN_AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; WIDEN_AVX2-NEXT: testb $1, %al
-; WIDEN_AVX2-NEXT: je .LBB3_2
-; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store
-; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax
-; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rax)
-; WIDEN_AVX2-NEXT: .LBB3_2: # %else
-; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
+; WIDEN_AVX2-NEXT: vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
+; WIDEN_AVX2-NEXT: jne .LBB3_1
+; WIDEN_AVX2-NEXT: # %bb.2: # %else
+; WIDEN_AVX2-NEXT: testb $2, %al
+; WIDEN_AVX2-NEXT: jne .LBB3_3
+; WIDEN_AVX2-NEXT: .LBB3_4: # %else2
+; WIDEN_AVX2-NEXT: retq
+; WIDEN_AVX2-NEXT: .LBB3_1: # %cond.store
+; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx
+; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rcx)
+; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: je .LBB3_4
-; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1
+; WIDEN_AVX2-NEXT: .LBB3_3: # %cond.store1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
-; WIDEN_AVX2-NEXT: .LBB3_4: # %else2
; WIDEN_AVX2-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data:
; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; PROMOTE_AVX2-NEXT: testb $1, %al
-; PROMOTE_AVX2-NEXT: je .LBB3_2
-; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store
-; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: .LBB3_2: # %else
-; PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
+; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
+; PROMOTE_AVX2-NEXT: jne .LBB3_1
+; PROMOTE_AVX2-NEXT: # %bb.2: # %else
+; PROMOTE_AVX2-NEXT: testb $2, %al
+; PROMOTE_AVX2-NEXT: jne .LBB3_3
+; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2
+; PROMOTE_AVX2-NEXT: retq
+; PROMOTE_AVX2-NEXT: .LBB3_1: # %cond.store
+; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
+; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx)
+; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: je .LBB3_4
-; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1
+; PROMOTE_AVX2-NEXT: .LBB3_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2
; PROMOTE_AVX2-NEXT: retq
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
@@ -425,20 +437,23 @@
; WIDEN_AVX2-NEXT: vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; WIDEN_AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; WIDEN_AVX2-NEXT: testb $1, %al
-; WIDEN_AVX2-NEXT: je .LBB5_2
-; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store
-; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax
-; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rax)
-; WIDEN_AVX2-NEXT: .LBB5_2: # %else
-; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
+; WIDEN_AVX2-NEXT: vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
+; WIDEN_AVX2-NEXT: jne .LBB5_1
+; WIDEN_AVX2-NEXT: # %bb.2: # %else
+; WIDEN_AVX2-NEXT: testb $2, %al
+; WIDEN_AVX2-NEXT: jne .LBB5_3
+; WIDEN_AVX2-NEXT: .LBB5_4: # %else2
+; WIDEN_AVX2-NEXT: retq
+; WIDEN_AVX2-NEXT: .LBB5_1: # %cond.store
+; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx
+; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rcx)
+; WIDEN_AVX2-NEXT: testb $2, %al
; WIDEN_AVX2-NEXT: je .LBB5_4
-; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1
+; WIDEN_AVX2-NEXT: .LBB5_3: # %cond.store1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
-; WIDEN_AVX2-NEXT: .LBB5_4: # %else2
; WIDEN_AVX2-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index:
@@ -450,20 +465,23 @@
; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; PROMOTE_AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; PROMOTE_AVX2-NEXT: testb $1, %al
-; PROMOTE_AVX2-NEXT: je .LBB5_2
-; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store
-; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: .LBB5_2: # %else
-; PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
+; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
+; PROMOTE_AVX2-NEXT: jne .LBB5_1
+; PROMOTE_AVX2-NEXT: # %bb.2: # %else
+; PROMOTE_AVX2-NEXT: testb $2, %al
+; PROMOTE_AVX2-NEXT: jne .LBB5_3
+; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2
+; PROMOTE_AVX2-NEXT: retq
+; PROMOTE_AVX2-NEXT: .LBB5_1: # %cond.store
+; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
+; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx)
+; PROMOTE_AVX2-NEXT: testb $2, %al
; PROMOTE_AVX2-NEXT: je .LBB5_4
-; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1
+; PROMOTE_AVX2-NEXT: .LBB5_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2
; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr i32, i32 *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
Index: llvm/trunk/test/CodeGen/X86/masked_load.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_load.ll
+++ llvm/trunk/test/CodeGen/X86/masked_load.ll
@@ -41,18 +41,21 @@
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB1_2
-; SSE2-NEXT: ## %bb.1: ## %cond.load
+; SSE2-NEXT: jne LBB1_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB1_3
+; SSE2-NEXT: LBB1_4: ## %else2
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB1_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; SSE2-NEXT: LBB1_2: ## %else
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB1_4
-; SSE2-NEXT: ## %bb.3: ## %cond.load1
+; SSE2-NEXT: LBB1_3: ## %cond.load1
; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; SSE2-NEXT: LBB1_4: ## %else2
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -60,18 +63,21 @@
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
-; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: movmskpd %xmm2, %eax
; SSE42-NEXT: testb $1, %al
-; SSE42-NEXT: je LBB1_2
-; SSE42-NEXT: ## %bb.1: ## %cond.load
+; SSE42-NEXT: jne LBB1_1
+; SSE42-NEXT: ## %bb.2: ## %else
+; SSE42-NEXT: testb $2, %al
+; SSE42-NEXT: jne LBB1_3
+; SSE42-NEXT: LBB1_4: ## %else2
+; SSE42-NEXT: movaps %xmm1, %xmm0
+; SSE42-NEXT: retq
+; SSE42-NEXT: LBB1_1: ## %cond.load
; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; SSE42-NEXT: LBB1_2: ## %else
-; SSE42-NEXT: pextrb $8, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB1_4
-; SSE42-NEXT: ## %bb.3: ## %cond.load1
+; SSE42-NEXT: LBB1_3: ## %cond.load1
; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; SSE42-NEXT: LBB1_4: ## %else2
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
;
@@ -106,73 +112,41 @@
}
define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
-; SSE2-LABEL: load_v4f64_v4i32:
-; SSE2: ## %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB2_2
-; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; SSE2-NEXT: LBB2_2: ## %else
-; SSE2-NEXT: pextrw $2, %xmm3, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB2_4
-; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; SSE2-NEXT: LBB2_4: ## %else2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT: pextrw $4,
%xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB2_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; SSE2-NEXT: LBB2_6: ## %else5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB2_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; SSE2-NEXT: LBB2_8: ## %else8 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: load_v4f64_v4i32: -; SSE42: ## %bb.0: -; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE42-NEXT: pextrb $0, %xmm3, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB2_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE42-NEXT: LBB2_2: ## %else -; SSE42-NEXT: pextrb $4, %xmm3, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB2_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; SSE42-NEXT: LBB2_4: ## %else2 -; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE42-NEXT: pextrb $8, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB2_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; SSE42-NEXT: LBB2_6: ## %else5 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB2_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; SSE42-NEXT: LBB2_8: ## %else8 -; SSE42-NEXT: movaps %xmm1, %xmm0 -; SSE42-NEXT: movaps %xmm2, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: load_v4f64_v4i32: +; SSE: ## %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE-NEXT: movmskps %xmm3, %eax +; SSE-NEXT: testb $1, %al +; SSE-NEXT: jne LBB2_1 +; SSE-NEXT: ## %bb.2: ## %else +; SSE-NEXT: testb $2, %al +; SSE-NEXT: jne LBB2_3 +; SSE-NEXT: LBB2_4: ## %else2 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: jne LBB2_5 +; SSE-NEXT: LBB2_6: ## %else5 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: je LBB2_8 +; SSE-NEXT: LBB2_7: ## %cond.load7 +; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SSE-NEXT: LBB2_8: ## %else8 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; SSE-NEXT: LBB2_1: ## %cond.load +; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; SSE-NEXT: testb $2, %al +; SSE-NEXT: je LBB2_4 +; SSE-NEXT: LBB2_3: ## %cond.load1 +; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; SSE-NEXT: testb $4, %al +; SSE-NEXT: je LBB2_6 +; SSE-NEXT: LBB2_5: ## %cond.load4 +; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SSE-NEXT: testb $8, %al +; SSE-NEXT: jne LBB2_7 +; SSE-NEXT: jmp LBB2_8 ; ; AVX1-LABEL: load_v4f64_v4i32: ; AVX1: ## %bb.0: @@ -217,77 +191,42 @@ } define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %addr) { -; SSE2-LABEL: load_v4f64_v4i32_zero: -; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: je LBB3_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: LBB3_2: ## %else -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB3_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhps {{.*#+}} xmm0 = 
xmm0[0,1],mem[0,1] -; SSE2-NEXT: LBB3_4: ## %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB3_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE2-NEXT: LBB3_6: ## %else5 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB3_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; SSE2-NEXT: LBB3_8: ## %else8 -; SSE2-NEXT: retq -; -; SSE42-LABEL: load_v4f64_v4i32_zero: -; SSE42: ## %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: movdqa %xmm2, %xmm3 -; SSE42-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE42-NEXT: pextrb $0, %xmm3, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: xorps %xmm1, %xmm1 -; SSE42-NEXT: je LBB3_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE42-NEXT: xorps %xmm1, %xmm1 -; SSE42-NEXT: LBB3_2: ## %else -; SSE42-NEXT: pextrb $4, %xmm3, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB3_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; SSE42-NEXT: LBB3_4: ## %else2 -; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE42-NEXT: pextrb $8, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB3_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE42-NEXT: LBB3_6: ## %else5 -; SSE42-NEXT: pextrb $12, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB3_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; SSE42-NEXT: LBB3_8: ## %else8 -; SSE42-NEXT: retq +; SSE-LABEL: load_v4f64_v4i32_zero: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: testb $1, %al +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: jne LBB3_1 +; SSE-NEXT: ## %bb.2: ## %else +; SSE-NEXT: testb $2, %al +; SSE-NEXT: jne LBB3_3 +; SSE-NEXT: LBB3_4: ## %else2 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: jne LBB3_5 +; SSE-NEXT: LBB3_6: ## %else5 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: jne LBB3_7 +; SSE-NEXT: LBB3_8: ## %else8 +; SSE-NEXT: retq +; SSE-NEXT: LBB3_1: ## %cond.load +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: testb $2, %al +; SSE-NEXT: je LBB3_4 +; SSE-NEXT: LBB3_3: ## %cond.load1 +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-NEXT: testb $4, %al +; SSE-NEXT: je LBB3_6 +; SSE-NEXT: LBB3_5: ## %cond.load4 +; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; SSE-NEXT: testb $8, %al +; SSE-NEXT: je LBB3_8 +; SSE-NEXT: LBB3_7: ## %cond.load7 +; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; SSE-NEXT: retq ; ; AVX1-LABEL: load_v4f64_v4i32_zero: ; AVX1: ## %bb.0: @@ -332,74 +271,82 @@ ; SSE2-LABEL: load_v4f64_v4i64: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB4_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; SSE2-NEXT: LBB4_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2] +; SSE2-NEXT: pand 
%xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm5, %xmm1 +; SSE2-NEXT: movmskps %xmm1, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB4_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SSE2-NEXT: jne LBB4_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB4_3 ; SSE2-NEXT: LBB4_4: ## %else2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB4_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB4_5 ; SSE2-NEXT: LBB4_6: ## %else5 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB4_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 +; SSE2-NEXT: LBB4_7: ## %cond.load7 ; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE2-NEXT: LBB4_8: ## %else8 ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: retq +; SSE2-NEXT: LBB4_1: ## %cond.load +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB4_4 +; SSE2-NEXT: LBB4_3: ## %cond.load1 +; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB4_6 +; SSE2-NEXT: LBB4_5: ## %cond.load4 +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB4_7 +; SSE2-NEXT: jmp LBB4_8 ; ; SSE42-LABEL: load_v4f64_v4i64: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm4 -; SSE42-NEXT: pextrb $0, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB4_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; SSE42-NEXT: LBB4_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB4_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SSE42-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE42-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE42-NEXT: packssdw %xmm1, %xmm0 +; SSE42-NEXT: movmskps %xmm0, %eax +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB4_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB4_3 ; SSE42-NEXT: LBB4_4: ## %else2 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: pextrb $0, %xmm1, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB4_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB4_5 ; SSE42-NEXT: LBB4_6: ## %else5 -; SSE42-NEXT: pextrb $8, %xmm1, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $8, %al ; SSE42-NEXT: je LBB4_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 +; SSE42-NEXT: LBB4_7: ## %cond.load7 ; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE42-NEXT: LBB4_8: ## %else8 ; SSE42-NEXT: movaps %xmm2, %xmm0 ; SSE42-NEXT: movaps %xmm3, %xmm1 ; SSE42-NEXT: retq +; SSE42-NEXT: LBB4_1: ## %cond.load +; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB4_4 +; SSE42-NEXT: LBB4_3: ## %cond.load1 +; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; 
SSE42-NEXT: testb $4, %al +; SSE42-NEXT: je LBB4_6 +; SSE42-NEXT: LBB4_5: ## %cond.load4 +; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB4_7 +; SSE42-NEXT: jmp LBB4_8 ; ; AVX1-LABEL: load_v4f64_v4i64: ; AVX1: ## %bb.0: @@ -442,133 +389,72 @@ } define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <8 x double> %dst) { -; SSE2-LABEL: load_v8f64_v8i16: -; SSE2: ## %bb.0: -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE2-NEXT: LBB5_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; SSE2-NEXT: LBB5_4: ## %else2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE2-NEXT: pextrw $2, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; SSE2-NEXT: LBB5_6: ## %else5 -; SSE2-NEXT: pextrw $3, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; SSE2-NEXT: LBB5_8: ## %else8 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE2-NEXT: pextrw $4, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] -; SSE2-NEXT: LBB5_10: ## %else11 -; SSE2-NEXT: pextrw $5, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] -; SSE2-NEXT: LBB5_12: ## %else14 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqw %xmm5, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] -; SSE2-NEXT: LBB5_14: ## %else17 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB5_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 -; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] -; SSE2-NEXT: LBB5_16: ## %else20 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm2, %xmm1 -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm4, %xmm3 -; SSE2-NEXT: retq -; -; SSE42-LABEL: load_v8f64_v8i16: -; SSE42: ## %bb.0: -; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE42-NEXT: pextrb $0, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB5_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE42-NEXT: LBB5_2: ## %else -; SSE42-NEXT: pextrb $2, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB5_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; SSE42-NEXT: LBB5_4: ## %else2 -; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE42-NEXT: pextrb $4, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB5_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; SSE42-NEXT: LBB5_6: ## %else5 -; SSE42-NEXT: pextrb $6, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je 
LBB5_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; SSE42-NEXT: LBB5_8: ## %else8 -; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE42-NEXT: pextrb $8, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB5_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] -; SSE42-NEXT: LBB5_10: ## %else11 -; SSE42-NEXT: pextrb $10, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB5_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] -; SSE42-NEXT: LBB5_12: ## %else14 -; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm5, %xmm0 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB5_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] -; SSE42-NEXT: LBB5_14: ## %else17 -; SSE42-NEXT: pextrb $14, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB5_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 -; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] -; SSE42-NEXT: LBB5_16: ## %else20 -; SSE42-NEXT: movaps %xmm1, %xmm0 -; SSE42-NEXT: movaps %xmm2, %xmm1 -; SSE42-NEXT: movaps %xmm3, %xmm2 -; SSE42-NEXT: movaps %xmm4, %xmm3 -; SSE42-NEXT: retq +; SSE-LABEL: load_v8f64_v8i16: +; SSE: ## %bb.0: +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpeqw %xmm0, %xmm5 +; SSE-NEXT: packsswb %xmm0, %xmm5 +; SSE-NEXT: pmovmskb %xmm5, %eax +; SSE-NEXT: testb $1, %al +; SSE-NEXT: jne LBB5_1 +; SSE-NEXT: ## %bb.2: ## %else +; SSE-NEXT: testb $2, %al +; SSE-NEXT: jne LBB5_3 +; SSE-NEXT: LBB5_4: ## %else2 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: jne LBB5_5 +; SSE-NEXT: LBB5_6: ## %else5 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: jne LBB5_7 +; SSE-NEXT: LBB5_8: ## %else8 +; SSE-NEXT: testb $16, %al +; SSE-NEXT: jne LBB5_9 +; SSE-NEXT: LBB5_10: ## %else11 +; SSE-NEXT: testb $32, %al +; SSE-NEXT: jne LBB5_11 +; SSE-NEXT: LBB5_12: ## %else14 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: jne LBB5_13 +; SSE-NEXT: LBB5_14: ## %else17 +; SSE-NEXT: testb $-128, %al +; SSE-NEXT: je LBB5_16 +; SSE-NEXT: LBB5_15: ## %cond.load19 +; SSE-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; SSE-NEXT: LBB5_16: ## %else20 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: retq +; SSE-NEXT: LBB5_1: ## %cond.load +; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; SSE-NEXT: testb $2, %al +; SSE-NEXT: je LBB5_4 +; SSE-NEXT: LBB5_3: ## %cond.load1 +; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; SSE-NEXT: testb $4, %al +; SSE-NEXT: je LBB5_6 +; SSE-NEXT: LBB5_5: ## %cond.load4 +; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SSE-NEXT: testb $8, %al +; SSE-NEXT: je LBB5_8 +; SSE-NEXT: LBB5_7: ## %cond.load7 +; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SSE-NEXT: testb $16, %al +; SSE-NEXT: je LBB5_10 +; SSE-NEXT: LBB5_9: ## %cond.load10 +; SSE-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE-NEXT: testb $32, %al +; SSE-NEXT: je LBB5_12 +; SSE-NEXT: LBB5_11: ## %cond.load13 +; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] +; SSE-NEXT: testb $64, %al +; SSE-NEXT: je LBB5_14 +; SSE-NEXT: LBB5_13: ## %cond.load16 +; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; SSE-NEXT: testb $-128, %al +; SSE-NEXT: jne LBB5_15 +; SSE-NEXT: jmp LBB5_16 ; ; AVX1-LABEL: load_v8f64_v8i16: ; AVX1: ## %bb.0: @@ -638,135 +524,123 @@ ; 
SSE2-LABEL: load_v8f64_v8i64: ; SSE2: ## %bb.0: ; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: movaps %xmm6, %xmm9 ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB6_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] -; SSE2-NEXT: LBB6_2: ## %else -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB6_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm6, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packsswb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne LBB6_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB6_3 ; SSE2-NEXT: LBB6_4: ## %else2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB6_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB6_5 ; SSE2-NEXT: LBB6_6: ## %else5 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB6_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB6_7 ; SSE2-NEXT: LBB6_8: ## %else8 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB6_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB6_9 ; SSE2-NEXT: LBB6_10: ## %else11 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB6_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB6_11 ; SSE2-NEXT: LBB6_12: ## %else14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB6_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB6_13 ; SSE2-NEXT: LBB6_14: ## %else17 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB6_16 -; SSE2-NEXT: ## %bb.15: 
## %cond.load19 +; SSE2-NEXT: LBB6_15: ## %cond.load19 ; SSE2-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1] ; SSE2-NEXT: LBB6_16: ## %else20 ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm9, %xmm2 ; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq +; SSE2-NEXT: LBB6_1: ## %cond.load +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB6_4 +; SSE2-NEXT: LBB6_3: ## %cond.load1 +; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB6_6 +; SSE2-NEXT: LBB6_5: ## %cond.load4 +; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je LBB6_8 +; SSE2-NEXT: LBB6_7: ## %cond.load7 +; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je LBB6_10 +; SSE2-NEXT: LBB6_9: ## %cond.load10 +; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3] +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je LBB6_12 +; SSE2-NEXT: LBB6_11: ## %cond.load13 +; SSE2-NEXT: movhps {{.*#+}} xmm9 = xmm9[0,1],mem[0,1] +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je LBB6_14 +; SSE2-NEXT: LBB6_13: ## %cond.load16 +; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB6_15 +; SSE2-NEXT: jmp LBB6_16 ; ; SSE42-LABEL: load_v8f64_v8i64: ; SSE42: ## %bb.0: ; SSE42-NEXT: movdqa %xmm7, %xmm8 ; SSE42-NEXT: pxor %xmm7, %xmm7 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm7 -; SSE42-NEXT: pextrb $0, %xmm7, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB6_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] -; SSE42-NEXT: LBB6_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm7, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB6_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; SSE42-NEXT: pcmpeqq %xmm7, %xmm3 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm2 +; SSE42-NEXT: packssdw %xmm3, %xmm2 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm1 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 +; SSE42-NEXT: packssdw %xmm1, %xmm0 +; SSE42-NEXT: packssdw %xmm2, %xmm0 +; SSE42-NEXT: packsswb %xmm0, %xmm0 +; SSE42-NEXT: pmovmskb %xmm0, %eax +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB6_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB6_3 ; SSE42-NEXT: LBB6_4: ## %else2 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: pextrb $0, %xmm1, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB6_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB6_5 ; SSE42-NEXT: LBB6_6: ## %else5 -; SSE42-NEXT: pextrb $8, %xmm1, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB6_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB6_7 ; SSE42-NEXT: LBB6_8: ## %else8 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB6_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB6_9 ; SSE42-NEXT: LBB6_10: ## %else11 -; SSE42-NEXT: pextrb $8, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB6_12 -; SSE42-NEXT: ## %bb.11: ## 
%cond.load13 -; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB6_11 ; SSE42-NEXT: LBB6_12: ## %else14 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm3 -; SSE42-NEXT: pextrb $0, %xmm3, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB6_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB6_13 ; SSE42-NEXT: LBB6_14: ## %else17 -; SSE42-NEXT: pextrb $8, %xmm3, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB6_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 +; SSE42-NEXT: LBB6_15: ## %cond.load19 ; SSE42-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1] ; SSE42-NEXT: LBB6_16: ## %else20 ; SSE42-NEXT: movaps %xmm4, %xmm0 @@ -774,6 +648,35 @@ ; SSE42-NEXT: movaps %xmm6, %xmm2 ; SSE42-NEXT: movaps %xmm8, %xmm3 ; SSE42-NEXT: retq +; SSE42-NEXT: LBB6_1: ## %cond.load +; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB6_4 +; SSE42-NEXT: LBB6_3: ## %cond.load1 +; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: je LBB6_6 +; SSE42-NEXT: LBB6_5: ## %cond.load4 +; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: je LBB6_8 +; SSE42-NEXT: LBB6_7: ## %cond.load7 +; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: je LBB6_10 +; SSE42-NEXT: LBB6_9: ## %cond.load10 +; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: je LBB6_12 +; SSE42-NEXT: LBB6_11: ## %cond.load13 +; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: je LBB6_14 +; SSE42-NEXT: LBB6_13: ## %cond.load16 +; SSE42-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB6_15 +; SSE42-NEXT: jmp LBB6_16 ; ; AVX1-LABEL: load_v8f64_v8i64: ; AVX1: ## %bb.0: @@ -825,22 +728,25 @@ ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB7_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE2-NEXT: LBB7_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne LBB7_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB7_3 +; SSE2-NEXT: LBB7_4: ## %else2 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB7_1: ## %cond.load +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB7_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 +; SSE2-NEXT: LBB7_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: LBB7_4: ## %else2 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -849,19 +755,22 @@ ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE42-NEXT: pcmpeqq %xmm2, %xmm0 -; SSE42-NEXT: pextrb $0, %xmm0, %eax 
-; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB7_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: LBB7_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm0, %eax +; SSE42-NEXT: movmskpd %xmm0, %eax ; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB7_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB7_3 +; SSE42-NEXT: LBB7_4: ## %else2 +; SSE42-NEXT: movaps %xmm1, %xmm0 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB7_1: ## %cond.load +; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB7_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB7_3: ## %cond.load1 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; SSE42-NEXT: LBB7_4: ## %else2 ; SSE42-NEXT: movaps %xmm1, %xmm0 ; SSE42-NEXT: retq ; @@ -914,47 +823,50 @@ ; SSE2-LABEL: load_v2f32_v2i32_undef: ; SSE2: ## %bb.0: ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: ## implicit-def: $xmm0 -; SSE2-NEXT: je LBB8_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load +; SSE2-NEXT: jne LBB8_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB8_3 +; SSE2-NEXT: LBB8_4: ## %else2 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB8_1: ## %cond.load ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: LBB8_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB8_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 +; SSE2-NEXT: LBB8_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: LBB8_4: ## %else2 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v2f32_v2i32_undef: ; SSE42: ## %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: pextrb $0, %xmm1, %eax +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE42-NEXT: pcmpeqq %xmm1, %xmm0 +; SSE42-NEXT: movmskpd %xmm0, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: ## implicit-def: $xmm0 -; SSE42-NEXT: je LBB8_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load +; SSE42-NEXT: jne LBB8_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB8_3 +; SSE42-NEXT: LBB8_4: ## %else2 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB8_1: ## %cond.load ; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE42-NEXT: LBB8_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm1, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB8_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB8_3: ## %cond.load1 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE42-NEXT: LBB8_4: ## %else2 ; SSE42-NEXT: retq ; ; AVX1-LABEL: 
load_v2f32_v2i32_undef: @@ -1004,40 +916,43 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB9_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE2-NEXT: LBB9_2: ## %else -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: movmskps %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB9_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] -; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: jne LBB9_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB9_3 ; SSE2-NEXT: LBB9_4: ## %else2 -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB9_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB9_5 ; SSE2-NEXT: LBB9_6: ## %else5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB9_7 +; SSE2-NEXT: LBB9_8: ## %else8 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB9_1: ## %cond.load +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB9_4 +; SSE2-NEXT: LBB9_3: ## %cond.load1 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB9_6 +; SSE2-NEXT: LBB9_5: ## %cond.load4 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB9_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 +; SSE2-NEXT: LBB9_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE2-NEXT: LBB9_8: ## %else8 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1045,33 +960,36 @@ ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB9_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE42-NEXT: LBB9_2: ## %else -; SSE42-NEXT: pextrb $4, %xmm2, %eax +; SSE42-NEXT: movmskps %xmm2, %eax ; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB9_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB9_3 +; SSE42-NEXT: LBB9_4: ## %else2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB9_5 +; SSE42-NEXT: LBB9_6: ## %else5 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB9_7 +; SSE42-NEXT: LBB9_8: ## %else8 +; SSE42-NEXT: movaps %xmm1, %xmm0 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB9_1: ## 
%cond.load +; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB9_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB9_3: ## %cond.load1 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; SSE42-NEXT: LBB9_4: ## %else2 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE42-NEXT: pextrb $8, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $4, %al ; SSE42-NEXT: je LBB9_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 +; SSE42-NEXT: LBB9_5: ## %cond.load4 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; SSE42-NEXT: LBB9_6: ## %else5 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $8, %al ; SSE42-NEXT: je LBB9_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 +; SSE42-NEXT: LBB9_7: ## %cond.load7 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; SSE42-NEXT: LBB9_8: ## %else8 ; SSE42-NEXT: movaps %xmm1, %xmm0 ; SSE42-NEXT: retq ; @@ -1108,130 +1026,148 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) { ; SSE2-LABEL: load_v8f32_v8i1_zero: ; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: je LBB10_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load +; SSE2-NEXT: jne LBB10_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB10_3 +; SSE2-NEXT: LBB10_4: ## %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB10_5 +; SSE2-NEXT: LBB10_6: ## %else5 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB10_7 +; SSE2-NEXT: LBB10_8: ## %else8 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB10_9 +; SSE2-NEXT: LBB10_10: ## %else11 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB10_11 +; SSE2-NEXT: LBB10_12: ## %else14 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB10_13 +; SSE2-NEXT: LBB10_14: ## %else17 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB10_15 +; SSE2-NEXT: LBB10_16: ## %else20 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB10_1: ## %cond.load ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: LBB10_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB10_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE2-NEXT: movaps %xmm3, %xmm0 -; SSE2-NEXT: LBB10_4: ## %else2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB10_3: ## %cond.load1 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB10_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] -; SSE2-NEXT: LBB10_6: ## %else5 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB10_5: ## %cond.load4 +; SSE2-NEXT: movss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB10_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSE2-NEXT: LBB10_8: ## %else8 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB10_7: ## %cond.load7 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB10_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE2-NEXT: LBB10_10: ## %else11 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB10_9: ## %cond.load10 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB10_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: LBB10_12: ## %else14 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB10_11: ## %cond.load13 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB10_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] -; SSE2-NEXT: LBB10_14: ## %else17 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB10_13: ## %cond.load16 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB10_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 +; SSE2-NEXT: LBB10_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] -; SSE2-NEXT: LBB10_16: ## %else20 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v8f32_v8i1_zero: ; SSE42: ## %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm0, %eax +; SSE42-NEXT: psllw $15, %xmm0 +; SSE42-NEXT: packsswb %xmm0, %xmm0 +; SSE42-NEXT: pmovmskb %xmm0, %eax ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: xorps %xmm1, %xmm1 -; SSE42-NEXT: je LBB10_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load +; SSE42-NEXT: jne LBB10_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB10_3 +; SSE42-NEXT: LBB10_4: ## %else2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB10_5 +; SSE42-NEXT: LBB10_6: ## %else5 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB10_7 +; SSE42-NEXT: LBB10_8: ## %else8 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB10_9 +; SSE42-NEXT: 
LBB10_10: ## %else11 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB10_11 +; SSE42-NEXT: LBB10_12: ## %else14 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB10_13 +; SSE42-NEXT: LBB10_14: ## %else17 +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB10_15 +; SSE42-NEXT: LBB10_16: ## %else20 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB10_1: ## %cond.load ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE42-NEXT: xorps %xmm1, %xmm1 -; SSE42-NEXT: LBB10_2: ## %else -; SSE42-NEXT: pextrb $2, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB10_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB10_3: ## %cond.load1 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE42-NEXT: LBB10_4: ## %else2 -; SSE42-NEXT: pextrb $4, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $4, %al ; SSE42-NEXT: je LBB10_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 +; SSE42-NEXT: LBB10_5: ## %cond.load4 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE42-NEXT: LBB10_6: ## %else5 -; SSE42-NEXT: pextrb $6, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $8, %al ; SSE42-NEXT: je LBB10_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 +; SSE42-NEXT: LBB10_7: ## %cond.load7 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; SSE42-NEXT: LBB10_8: ## %else8 -; SSE42-NEXT: pextrb $8, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $16, %al ; SSE42-NEXT: je LBB10_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE42-NEXT: LBB10_10: ## %else11 -; SSE42-NEXT: pextrb $10, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: LBB10_9: ## %cond.load10 +; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE42-NEXT: testb $32, %al ; SSE42-NEXT: je LBB10_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 +; SSE42-NEXT: LBB10_11: ## %cond.load13 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; SSE42-NEXT: LBB10_12: ## %else14 -; SSE42-NEXT: pextrb $12, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $64, %al ; SSE42-NEXT: je LBB10_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 +; SSE42-NEXT: LBB10_13: ## %cond.load16 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; SSE42-NEXT: LBB10_14: ## %else17 -; SSE42-NEXT: pextrb $14, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB10_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 +; SSE42-NEXT: LBB10_15: ## %cond.load19 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; SSE42-NEXT: LBB10_16: ## %else20 ; SSE42-NEXT: retq ; ; AVX1-LABEL: load_v8f32_v8i1_zero: @@ -1285,148 +1221,154 @@ ; SSE2-LABEL: load_v8f32_v8i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: packssdw %xmm0, %xmm5 -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB11_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3] -; SSE2-NEXT: LBB11_2: ## %else -; SSE2-NEXT: psrlq $16, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB11_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; 
SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm2[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3] -; SSE2-NEXT: movaps %xmm4, %xmm2 -; SSE2-NEXT: LBB11_4: ## %else2 -; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB11_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm2[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne LBB11_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB11_3 +; SSE2-NEXT: LBB11_4: ## %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB11_5 ; SSE2-NEXT: LBB11_6: ## %else5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB11_7 +; SSE2-NEXT: LBB11_8: ## %else8 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB11_9 +; SSE2-NEXT: LBB11_10: ## %else11 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB11_11 +; SSE2-NEXT: LBB11_12: ## %else14 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB11_13 +; SSE2-NEXT: LBB11_14: ## %else17 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: je LBB11_16 +; SSE2-NEXT: LBB11_15: ## %cond.load19 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE2-NEXT: LBB11_16: ## %else20 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB11_1: ## %cond.load +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB11_4 +; SSE2-NEXT: LBB11_3: ## %cond.load1 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB11_6 +; SSE2-NEXT: LBB11_5: ## %cond.load4 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB11_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 +; SSE2-NEXT: LBB11_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE2-NEXT: LBB11_8: ## %else8 -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB11_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE2-NEXT: LBB11_10: ## %else11 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB11_9: ## %cond.load10 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB11_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 +; SSE2-NEXT: LBB11_11: ## %cond.load13 ; SSE2-NEXT: movss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: LBB11_12: ## %else14 -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB11_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 +; SSE2-NEXT: LBB11_13: ## %cond.load16 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE2-NEXT: LBB11_14: ## %else17 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB11_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE2-NEXT: LBB11_16: ## %else20 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: retq +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB11_15 +; SSE2-NEXT: jmp LBB11_16 ; ; SSE42-LABEL: load_v8f32_v8i32: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE42-NEXT: pextrb $0, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB11_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3] -; SSE42-NEXT: LBB11_2: ## %else -; SSE42-NEXT: pextrb $4, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB11_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; SSE42-NEXT: LBB11_4: ## %else2 -; SSE42-NEXT: pxor %xmm4, %xmm4 +; SSE42-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE42-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE42-NEXT: pextrb $8, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB11_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; SSE42-NEXT: packssdw %xmm1, %xmm0 +; SSE42-NEXT: packsswb %xmm0, %xmm0 +; SSE42-NEXT: pmovmskb %xmm0, %eax +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB11_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB11_3 +; SSE42-NEXT: LBB11_4: ## %else2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB11_5 ; SSE42-NEXT: LBB11_6: ## %else5 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB11_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB11_7 ; SSE42-NEXT: LBB11_8: ## %else8 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE42-NEXT: pextrb $0, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB11_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB11_9 ; SSE42-NEXT: LBB11_10: ## %else11 -; SSE42-NEXT: pextrb $4, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB11_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB11_11 ; SSE42-NEXT: LBB11_12: ## %else14 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; 
SSE42-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE42-NEXT: pextrb $8, %xmm1, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB11_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB11_13 ; SSE42-NEXT: LBB11_14: ## %else17 -; SSE42-NEXT: pextrb $12, %xmm1, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB11_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 +; SSE42-NEXT: LBB11_15: ## %cond.load19 ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; SSE42-NEXT: LBB11_16: ## %else20 ; SSE42-NEXT: movaps %xmm2, %xmm0 ; SSE42-NEXT: movaps %xmm3, %xmm1 ; SSE42-NEXT: retq +; SSE42-NEXT: LBB11_1: ## %cond.load +; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB11_4 +; SSE42-NEXT: LBB11_3: ## %cond.load1 +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: je LBB11_6 +; SSE42-NEXT: LBB11_5: ## %cond.load4 +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: je LBB11_8 +; SSE42-NEXT: LBB11_7: ## %cond.load7 +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: je LBB11_10 +; SSE42-NEXT: LBB11_9: ## %cond.load10 +; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7] +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: je LBB11_12 +; SSE42-NEXT: LBB11_11: ## %cond.load13 +; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: je LBB11_14 +; SSE42-NEXT: LBB11_13: ## %cond.load16 +; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB11_15 +; SSE42-NEXT: jmp LBB11_16 ; ; AVX1-LABEL: load_v8f32_v8i32: ; AVX1: ## %bb.0: @@ -1507,19 +1449,22 @@ ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB13_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load +; SSE2-NEXT: jne LBB13_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB13_3 +; SSE2-NEXT: LBB13_4: ## %else2 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB13_1: ## %cond.load ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE2-NEXT: LBB13_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB13_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 +; SSE2-NEXT: LBB13_3: ## %cond.load1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: LBB13_4: ## %else2 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1527,18 +1472,21 @@ ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm2, %eax +; SSE42-NEXT: movmskpd %xmm2, %eax ; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB13_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load +; SSE42-NEXT: jne LBB13_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB13_3 +; SSE42-NEXT: LBB13_4: ## %else2 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB13_1: ## 
%cond.load ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1 -; SSE42-NEXT: LBB13_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB13_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB13_3: ## %cond.load1 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1 -; SSE42-NEXT: LBB13_4: ## %else2 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: retq ; @@ -1584,76 +1532,84 @@ ; SSE2-LABEL: load_v4i64_v4i64: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB14_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; SSE2-NEXT: LBB14_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm5, %xmm1 +; SSE2-NEXT: movmskps %xmm1, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB14_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: jne LBB14_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB14_3 ; SSE2-NEXT: LBB14_4: ## %else2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB14_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB14_5 ; SSE2-NEXT: LBB14_6: ## %else5 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB14_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 +; SSE2-NEXT: LBB14_7: ## %cond.load7 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE2-NEXT: LBB14_8: ## %else8 ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: retq +; SSE2-NEXT: LBB14_1: ## %cond.load +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB14_4 +; SSE2-NEXT: LBB14_3: ## %cond.load1 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB14_6 +; SSE2-NEXT: LBB14_5: ## %cond.load4 +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB14_7 +; SSE2-NEXT: jmp LBB14_8 ; ; SSE42-LABEL: load_v4i64_v4i64: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm4 -; SSE42-NEXT: pextrb $0, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB14_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: pinsrq $0, (%rdi), %xmm2 -; SSE42-NEXT: LBB14_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB14_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm2 +; SSE42-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE42-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE42-NEXT: packssdw %xmm1, %xmm0 +; SSE42-NEXT: movmskps %xmm0, %eax +; SSE42-NEXT: testb $1, %al +; 
SSE42-NEXT: jne LBB14_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB14_3 ; SSE42-NEXT: LBB14_4: ## %else2 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: pextrb $0, %xmm1, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB14_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm3 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB14_5 ; SSE42-NEXT: LBB14_6: ## %else5 -; SSE42-NEXT: pextrb $8, %xmm1, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $8, %al ; SSE42-NEXT: je LBB14_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 +; SSE42-NEXT: LBB14_7: ## %cond.load7 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm3 ; SSE42-NEXT: LBB14_8: ## %else8 ; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: movdqa %xmm3, %xmm1 ; SSE42-NEXT: retq +; SSE42-NEXT: LBB14_1: ## %cond.load +; SSE42-NEXT: pinsrq $0, (%rdi), %xmm2 +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB14_4 +; SSE42-NEXT: LBB14_3: ## %cond.load1 +; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: je LBB14_6 +; SSE42-NEXT: LBB14_5: ## %cond.load4 +; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm3 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB14_7 +; SSE42-NEXT: jmp LBB14_8 ; ; AVX1-LABEL: load_v4i64_v4i64: ; AVX1: ## %bb.0: @@ -1700,61 +1656,32 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB15_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE2-NEXT: LBB15_2: ## %else -; SSE2-NEXT: shrl $16, %eax +; SSE2-NEXT: packsswb %xmm0, %xmm5 +; SSE2-NEXT: pmovmskb %xmm5, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB15_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: jne LBB15_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB15_3 ; SSE2-NEXT: LBB15_4: ## %else2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE2-NEXT: pextrw $2, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB15_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB15_5 ; SSE2-NEXT: LBB15_6: ## %else5 -; SSE2-NEXT: pextrw $3, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB15_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB15_7 ; SSE2-NEXT: LBB15_8: ## %else8 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE2-NEXT: pextrw $4, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB15_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB15_9 ; SSE2-NEXT: LBB15_10: ## %else11 -; SSE2-NEXT: pextrw $5, %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB15_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB15_11 ; SSE2-NEXT: LBB15_12: ## %else14 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqw %xmm5, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je 
LBB15_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB15_13 ; SSE2-NEXT: LBB15_14: ## %else17 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB15_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 +; SSE2-NEXT: LBB15_15: ## %cond.load19 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE2-NEXT: LBB15_16: ## %else20 @@ -1763,63 +1690,69 @@ ; SSE2-NEXT: movaps %xmm3, %xmm2 ; SSE2-NEXT: movaps %xmm4, %xmm3 ; SSE2-NEXT: retq +; SSE2-NEXT: LBB15_1: ## %cond.load +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB15_4 +; SSE2-NEXT: LBB15_3: ## %cond.load1 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB15_6 +; SSE2-NEXT: LBB15_5: ## %cond.load4 +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je LBB15_8 +; SSE2-NEXT: LBB15_7: ## %cond.load7 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je LBB15_10 +; SSE2-NEXT: LBB15_9: ## %cond.load10 +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je LBB15_12 +; SSE2-NEXT: LBB15_11: ## %cond.load13 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je LBB15_14 +; SSE2-NEXT: LBB15_13: ## %cond.load16 +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB15_15 +; SSE2-NEXT: jmp LBB15_16 ; ; SSE42-LABEL: load_v8i64_v8i16: ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm5, %xmm5 ; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE42-NEXT: pextrb $0, %xmm5, %eax +; SSE42-NEXT: packsswb %xmm0, %xmm5 +; SSE42-NEXT: pmovmskb %xmm5, %eax ; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB15_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1 -; SSE42-NEXT: LBB15_2: ## %else -; SSE42-NEXT: pextrb $2, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB15_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1 +; SSE42-NEXT: jne LBB15_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB15_3 ; SSE42-NEXT: LBB15_4: ## %else2 -; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE42-NEXT: pextrb $4, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB15_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB15_5 ; SSE42-NEXT: LBB15_6: ## %else5 -; SSE42-NEXT: pextrb $6, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB15_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm2 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB15_7 ; SSE42-NEXT: LBB15_8: ## %else8 -; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 -; SSE42-NEXT: pextrb $8, %xmm5, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB15_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm3 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB15_9 ; SSE42-NEXT: LBB15_10: ## %else11 -; SSE42-NEXT: pextrb $10, %xmm5, %eax -; 
SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB15_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm3 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB15_11 ; SSE42-NEXT: LBB15_12: ## %else14 -; SSE42-NEXT: pxor %xmm5, %xmm5 -; SSE42-NEXT: pcmpeqw %xmm5, %xmm0 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB15_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm4 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB15_13 ; SSE42-NEXT: LBB15_14: ## %else17 -; SSE42-NEXT: pextrb $14, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB15_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 +; SSE42-NEXT: LBB15_15: ## %cond.load19 ; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm4 ; SSE42-NEXT: LBB15_16: ## %else20 ; SSE42-NEXT: movdqa %xmm1, %xmm0 @@ -1827,6 +1760,35 @@ ; SSE42-NEXT: movdqa %xmm3, %xmm2 ; SSE42-NEXT: movdqa %xmm4, %xmm3 ; SSE42-NEXT: retq +; SSE42-NEXT: LBB15_1: ## %cond.load +; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1 +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB15_4 +; SSE42-NEXT: LBB15_3: ## %cond.load1 +; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: je LBB15_6 +; SSE42-NEXT: LBB15_5: ## %cond.load4 +; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm2 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: je LBB15_8 +; SSE42-NEXT: LBB15_7: ## %cond.load7 +; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm2 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: je LBB15_10 +; SSE42-NEXT: LBB15_9: ## %cond.load10 +; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm3 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: je LBB15_12 +; SSE42-NEXT: LBB15_11: ## %cond.load13 +; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm3 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: je LBB15_14 +; SSE42-NEXT: LBB15_13: ## %cond.load16 +; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm4 +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB15_15 +; SSE42-NEXT: jmp LBB15_16 ; ; AVX1-LABEL: load_v8i64_v8i16: ; AVX1: ## %bb.0: @@ -1896,139 +1858,127 @@ ; SSE2-LABEL: load_v8i64_v8i64: ; SSE2: ## %bb.0: ; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: movaps %xmm6, %xmm9 ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB16_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] -; SSE2-NEXT: LBB16_2: ## %else -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB16_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm6, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packsswb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; 
SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne LBB16_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB16_3 ; SSE2-NEXT: LBB16_4: ## %else2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB16_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB16_5 ; SSE2-NEXT: LBB16_6: ## %else5 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB16_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB16_7 ; SSE2-NEXT: LBB16_8: ## %else8 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB16_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB16_9 ; SSE2-NEXT: LBB16_10: ## %else11 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB16_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB16_11 ; SSE2-NEXT: LBB16_12: ## %else14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB16_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB16_13 ; SSE2-NEXT: LBB16_14: ## %else17 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB16_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 +; SSE2-NEXT: LBB16_15: ## %cond.load19 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE2-NEXT: LBB16_16: ## %else20 ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm9, %xmm2 ; SSE2-NEXT: movdqa %xmm8, %xmm3 ; SSE2-NEXT: retq +; SSE2-NEXT: LBB16_1: ## %cond.load +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB16_4 +; SSE2-NEXT: LBB16_3: ## %cond.load1 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB16_6 +; SSE2-NEXT: LBB16_5: ## %cond.load4 +; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je LBB16_8 +; SSE2-NEXT: LBB16_7: ## %cond.load7 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je LBB16_10 +; SSE2-NEXT: LBB16_9: ## %cond.load10 +; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3] +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je LBB16_12 +; SSE2-NEXT: LBB16_11: ## %cond.load13 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps 
{{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je LBB16_14 +; SSE2-NEXT: LBB16_13: ## %cond.load16 +; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB16_15 +; SSE2-NEXT: jmp LBB16_16 ; ; SSE42-LABEL: load_v8i64_v8i64: ; SSE42: ## %bb.0: ; SSE42-NEXT: movdqa %xmm7, %xmm8 ; SSE42-NEXT: pxor %xmm7, %xmm7 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm7 -; SSE42-NEXT: pextrb $0, %xmm7, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB16_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4 -; SSE42-NEXT: LBB16_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm7, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB16_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm4 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm3 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm2 +; SSE42-NEXT: packssdw %xmm3, %xmm2 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm1 +; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 +; SSE42-NEXT: packssdw %xmm1, %xmm0 +; SSE42-NEXT: packssdw %xmm2, %xmm0 +; SSE42-NEXT: packsswb %xmm0, %xmm0 +; SSE42-NEXT: pmovmskb %xmm0, %eax +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB16_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB16_3 ; SSE42-NEXT: LBB16_4: ## %else2 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: pextrb $0, %xmm1, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB16_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm5 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB16_5 ; SSE42-NEXT: LBB16_6: ## %else5 -; SSE42-NEXT: pextrb $8, %xmm1, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB16_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm5 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB16_7 ; SSE42-NEXT: LBB16_8: ## %else8 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB16_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm6 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB16_9 ; SSE42-NEXT: LBB16_10: ## %else11 -; SSE42-NEXT: pextrb $8, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB16_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm6 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB16_11 ; SSE42-NEXT: LBB16_12: ## %else14 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm3 -; SSE42-NEXT: pextrb $0, %xmm3, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB16_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm8 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB16_13 ; SSE42-NEXT: LBB16_14: ## %else17 -; SSE42-NEXT: pextrb $8, %xmm3, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB16_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 +; SSE42-NEXT: LBB16_15: ## %cond.load19 ; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm8 ; SSE42-NEXT: LBB16_16: ## %else20 ; SSE42-NEXT: movdqa %xmm4, %xmm0 @@ -2036,6 +1986,35 @@ ; SSE42-NEXT: movdqa %xmm6, %xmm2 ; SSE42-NEXT: movdqa %xmm8, %xmm3 ; SSE42-NEXT: retq +; SSE42-NEXT: LBB16_1: ## %cond.load +; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4 +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB16_4 +; SSE42-NEXT: LBB16_3: ## %cond.load1 +; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm4 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: je LBB16_6 +; 
SSE42-NEXT: LBB16_5: ## %cond.load4 +; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm5 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: je LBB16_8 +; SSE42-NEXT: LBB16_7: ## %cond.load7 +; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm5 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: je LBB16_10 +; SSE42-NEXT: LBB16_9: ## %cond.load10 +; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm6 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: je LBB16_12 +; SSE42-NEXT: LBB16_11: ## %cond.load13 +; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm6 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: je LBB16_14 +; SSE42-NEXT: LBB16_13: ## %cond.load16 +; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm8 +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB16_15 +; SSE42-NEXT: jmp LBB16_16 ; ; AVX1-LABEL: load_v8i64_v8i64: ; AVX1: ## %bb.0: @@ -2087,22 +2066,25 @@ ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB17_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE2-NEXT: LBB17_2: ## %else -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne LBB17_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB17_3 +; SSE2-NEXT: LBB17_4: ## %else2 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB17_1: ## %cond.load +; SSE2-NEXT: movl (%rdi), %ecx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB17_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 +; SSE2-NEXT: LBB17_3: ## %cond.load1 ; SSE2-NEXT: movl 4(%rdi), %eax ; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: LBB17_4: ## %else2 ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -2111,20 +2093,23 @@ ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE42-NEXT: pcmpeqq %xmm2, %xmm0 -; SSE42-NEXT: pextrb $0, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB17_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movl (%rdi), %eax -; SSE42-NEXT: pinsrq $0, %rax, %xmm1 -; SSE42-NEXT: LBB17_2: ## %else -; SSE42-NEXT: pextrb $8, %xmm0, %eax +; SSE42-NEXT: movmskpd %xmm0, %eax ; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB17_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB17_3 +; SSE42-NEXT: LBB17_4: ## %else2 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB17_1: ## %cond.load +; SSE42-NEXT: movl (%rdi), %ecx +; SSE42-NEXT: pinsrq $0, %rcx, %xmm1 +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB17_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB17_3: ## %cond.load1 ; SSE42-NEXT: movl 4(%rdi), %eax ; SSE42-NEXT: pinsrq $1, %rax, %xmm1 -; SSE42-NEXT: LBB17_4: ## %else2 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: retq ; @@ -2184,40 +2169,43 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movmskps %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB18_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE2-NEXT: LBB18_2: ## %else -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; 
SSE2-NEXT: je LBB18_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] -; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: jne LBB18_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB18_3 ; SSE2-NEXT: LBB18_4: ## %else2 -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB18_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB18_5 ; SSE2-NEXT: LBB18_6: ## %else5 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB18_7 +; SSE2-NEXT: LBB18_8: ## %else8 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB18_1: ## %cond.load +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB18_4 +; SSE2-NEXT: LBB18_3: ## %cond.load1 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB18_6 +; SSE2-NEXT: LBB18_5: ## %cond.load4 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB18_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 +; SSE2-NEXT: LBB18_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE2-NEXT: LBB18_8: ## %else8 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -2225,32 +2213,35 @@ ; SSE42: ## %bb.0: ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm2, %eax +; SSE42-NEXT: movmskps %xmm2, %eax ; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB18_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load +; SSE42-NEXT: jne LBB18_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB18_3 +; SSE42-NEXT: LBB18_4: ## %else2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB18_5 +; SSE42-NEXT: LBB18_6: ## %else5 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB18_7 +; SSE42-NEXT: LBB18_8: ## %else8 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB18_1: ## %cond.load ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1 -; SSE42-NEXT: LBB18_2: ## %else -; SSE42-NEXT: pextrb $4, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB18_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB18_3: ## %cond.load1 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1 -; SSE42-NEXT: LBB18_4: ## %else2 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE42-NEXT: pextrb $8, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $4, %al ; SSE42-NEXT: je LBB18_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 +; SSE42-NEXT: LBB18_5: ## %cond.load4 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1 -; SSE42-NEXT: LBB18_6: ## 
%else5 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $8, %al ; SSE42-NEXT: je LBB18_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 +; SSE42-NEXT: LBB18_7: ## %cond.load7 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1 -; SSE42-NEXT: LBB18_8: ## %else8 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: retq ; @@ -2295,66 +2286,33 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) { ; SSE2-LABEL: load_v8i32_v8i1: ; SSE2: ## %bb.0: -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB19_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE2-NEXT: LBB19_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB19_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] -; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne LBB19_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB19_3 ; SSE2-NEXT: LBB19_4: ## %else2 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB19_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB19_5 ; SSE2-NEXT: LBB19_6: ## %else5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB19_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB19_7 ; SSE2-NEXT: LBB19_8: ## %else8 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB19_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB19_9 ; SSE2-NEXT: LBB19_10: ## %else11 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB19_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] -; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB19_11 ; SSE2-NEXT: LBB19_12: ## %else14 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB19_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB19_13 ; SSE2-NEXT: LBB19_14: ## %else17 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB19_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 +; SSE2-NEXT: LBB19_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] @@ -2362,60 +2320,113 @@ ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq +; SSE2-NEXT: LBB19_1: ## %cond.load +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB19_4 +; SSE2-NEXT: LBB19_3: ## %cond.load1 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB19_6 +; SSE2-NEXT: LBB19_5: ## %cond.load4 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je LBB19_8 +; SSE2-NEXT: LBB19_7: ## %cond.load7 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je LBB19_10 +; SSE2-NEXT: LBB19_9: ## %cond.load10 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je LBB19_12 +; SSE2-NEXT: LBB19_11: ## %cond.load13 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je LBB19_14 +; SSE2-NEXT: LBB19_13: ## %cond.load16 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB19_15 +; SSE2-NEXT: jmp LBB19_16 ; ; SSE42-LABEL: load_v8i32_v8i1: ; SSE42: ## %bb.0: -; SSE42-NEXT: pextrb $0, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB19_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1 -; SSE42-NEXT: LBB19_2: ## %else -; SSE42-NEXT: pextrb $2, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB19_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1 +; SSE42-NEXT: psllw $15, %xmm0 +; SSE42-NEXT: packsswb %xmm0, %xmm0 +; SSE42-NEXT: pmovmskb %xmm0, %eax +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB19_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB19_3 ; SSE42-NEXT: LBB19_4: ## %else2 -; SSE42-NEXT: pextrb $4, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB19_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB19_5 ; SSE42-NEXT: LBB19_6: ## %else5 -; SSE42-NEXT: pextrb $6, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB19_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB19_7 ; SSE42-NEXT: LBB19_8: ## %else8 -; SSE42-NEXT: pextrb $8, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB19_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm2 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB19_9 
; SSE42-NEXT: LBB19_10: ## %else11 -; SSE42-NEXT: pextrb $10, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB19_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm2 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB19_11 ; SSE42-NEXT: LBB19_12: ## %else14 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB19_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm2 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB19_13 ; SSE42-NEXT: LBB19_14: ## %else17 -; SSE42-NEXT: pextrb $14, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB19_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 +; SSE42-NEXT: LBB19_15: ## %cond.load19 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm2 ; SSE42-NEXT: LBB19_16: ## %else20 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq +; SSE42-NEXT: LBB19_1: ## %cond.load +; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1 +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB19_4 +; SSE42-NEXT: LBB19_3: ## %cond.load1 +; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: je LBB19_6 +; SSE42-NEXT: LBB19_5: ## %cond.load4 +; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: je LBB19_8 +; SSE42-NEXT: LBB19_7: ## %cond.load7 +; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: je LBB19_10 +; SSE42-NEXT: LBB19_9: ## %cond.load10 +; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm2 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: je LBB19_12 +; SSE42-NEXT: LBB19_11: ## %cond.load13 +; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm2 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: je LBB19_14 +; SSE42-NEXT: LBB19_13: ## %cond.load16 +; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm2 +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB19_15 +; SSE42-NEXT: jmp LBB19_16 ; ; AVX1-LABEL: load_v8i32_v8i1: ; AVX1: ## %bb.0: @@ -2470,129 +2481,147 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) { ; SSE2-LABEL: load_v8i32_v8i1_zero: ; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: je LBB20_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load +; SSE2-NEXT: jne LBB20_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB20_3 +; SSE2-NEXT: LBB20_4: ## %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB20_5 +; SSE2-NEXT: LBB20_6: ## %else5 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB20_7 +; SSE2-NEXT: LBB20_8: ## %else8 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB20_9 +; SSE2-NEXT: LBB20_10: ## %else11 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB20_11 +; SSE2-NEXT: LBB20_12: ## %else14 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB20_13 +; SSE2-NEXT: LBB20_14: ## %else17 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB20_15 +; SSE2-NEXT: LBB20_16: ## %else20 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB20_1: ## %cond.load ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: LBB20_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB20_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: 
shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE2-NEXT: movaps %xmm3, %xmm0 -; SSE2-NEXT: LBB20_4: ## %else2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB20_3: ## %cond.load1 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB20_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] -; SSE2-NEXT: LBB20_6: ## %else5 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB20_5: ## %cond.load4 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB20_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSE2-NEXT: LBB20_8: ## %else8 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB20_7: ## %cond.load7 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB20_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE2-NEXT: LBB20_10: ## %else11 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB20_9: ## %cond.load10 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB20_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: LBB20_12: ## %else14 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB20_11: ## %cond.load13 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB20_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] -; SSE2-NEXT: LBB20_14: ## %else17 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB20_13: ## %cond.load16 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB20_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 +; SSE2-NEXT: LBB20_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] 
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] -; SSE2-NEXT: LBB20_16: ## %else20 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v8i32_v8i1_zero: ; SSE42: ## %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm0, %eax +; SSE42-NEXT: psllw $15, %xmm0 +; SSE42-NEXT: packsswb %xmm0, %xmm0 +; SSE42-NEXT: pmovmskb %xmm0, %eax ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: je LBB20_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load +; SSE42-NEXT: jne LBB20_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB20_3 +; SSE42-NEXT: LBB20_4: ## %else2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB20_5 +; SSE42-NEXT: LBB20_6: ## %else5 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB20_7 +; SSE42-NEXT: LBB20_8: ## %else8 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB20_9 +; SSE42-NEXT: LBB20_10: ## %else11 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB20_11 +; SSE42-NEXT: LBB20_12: ## %else14 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB20_13 +; SSE42-NEXT: LBB20_14: ## %else17 +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB20_15 +; SSE42-NEXT: LBB20_16: ## %else20 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB20_1: ## %cond.load ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: LBB20_2: ## %else -; SSE42-NEXT: pextrb $2, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $2, %al ; SSE42-NEXT: je LBB20_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 +; SSE42-NEXT: LBB20_3: ## %cond.load1 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0 -; SSE42-NEXT: LBB20_4: ## %else2 -; SSE42-NEXT: pextrb $4, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $4, %al ; SSE42-NEXT: je LBB20_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 +; SSE42-NEXT: LBB20_5: ## %cond.load4 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0 -; SSE42-NEXT: LBB20_6: ## %else5 -; SSE42-NEXT: pextrb $6, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $8, %al ; SSE42-NEXT: je LBB20_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 +; SSE42-NEXT: LBB20_7: ## %cond.load7 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0 -; SSE42-NEXT: LBB20_8: ## %else8 -; SSE42-NEXT: pextrb $8, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $16, %al ; SSE42-NEXT: je LBB20_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 +; SSE42-NEXT: LBB20_9: ## %cond.load10 ; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm1 -; SSE42-NEXT: LBB20_10: ## %else11 -; SSE42-NEXT: pextrb $10, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $32, %al ; SSE42-NEXT: je LBB20_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 +; SSE42-NEXT: LBB20_11: ## %cond.load13 ; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm1 -; SSE42-NEXT: LBB20_12: ## %else14 -; SSE42-NEXT: pextrb $12, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $64, %al ; SSE42-NEXT: je LBB20_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 +; SSE42-NEXT: LBB20_13: ## %cond.load16 ; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm1 -; SSE42-NEXT: LBB20_14: ## %else17 -; SSE42-NEXT: pextrb $14, %xmm2, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB20_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 +; SSE42-NEXT: LBB20_15: ## %cond.load19 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1 -; SSE42-NEXT: LBB20_16: ## %else20 ; SSE42-NEXT: retq ; ; AVX1-LABEL: load_v8i32_v8i1_zero: @@ -2647,262 +2676,196 @@ ; define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i16> %dst) { -; SSE2-LABEL: 
load_v8i16_v8i16: -; SSE2: ## %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: pinsrw $0, (%rdi), %xmm1 -; SSE2-NEXT: LBB21_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: pinsrw $1, 2(%rdi), %xmm1 -; SSE2-NEXT: LBB21_4: ## %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: pinsrw $2, 4(%rdi), %xmm1 -; SSE2-NEXT: LBB21_6: ## %else5 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: pinsrw $3, 6(%rdi), %xmm1 -; SSE2-NEXT: LBB21_8: ## %else8 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: pinsrw $4, 8(%rdi), %xmm1 -; SSE2-NEXT: LBB21_10: ## %else11 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: pinsrw $5, 10(%rdi), %xmm1 -; SSE2-NEXT: LBB21_12: ## %else14 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: pinsrw $6, 12(%rdi), %xmm1 -; SSE2-NEXT: LBB21_14: ## %else17 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB21_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 -; SSE2-NEXT: pinsrw $7, 14(%rdi), %xmm1 -; SSE2-NEXT: LBB21_16: ## %else20 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE42-LABEL: load_v8i16_v8i16: -; SSE42: ## %bb.0: -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE42-NEXT: pextrb $0, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: pinsrw $0, (%rdi), %xmm1 -; SSE42-NEXT: LBB21_2: ## %else -; SSE42-NEXT: pextrb $2, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: pinsrw $1, 2(%rdi), %xmm1 -; SSE42-NEXT: LBB21_4: ## %else2 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE42-NEXT: pextrb $4, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: pinsrw $2, 4(%rdi), %xmm1 -; SSE42-NEXT: LBB21_6: ## %else5 -; SSE42-NEXT: pextrb $6, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: pinsrw $3, 6(%rdi), %xmm1 -; SSE42-NEXT: LBB21_8: ## %else8 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE42-NEXT: pextrb $8, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: pinsrw $4, 8(%rdi), %xmm1 -; SSE42-NEXT: LBB21_10: ## %else11 -; SSE42-NEXT: pextrb $10, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: pinsrw $5, 10(%rdi), %xmm1 -; SSE42-NEXT: LBB21_12: ## %else14 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pcmpgtw 
%xmm0, %xmm2 -; SSE42-NEXT: pextrb $12, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: pinsrw $6, 12(%rdi), %xmm1 -; SSE42-NEXT: LBB21_14: ## %else17 -; SSE42-NEXT: pextrb $14, %xmm2, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB21_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 -; SSE42-NEXT: pinsrw $7, 14(%rdi), %xmm1 -; SSE42-NEXT: LBB21_16: ## %else20 -; SSE42-NEXT: movdqa %xmm1, %xmm0 -; SSE42-NEXT: retq +; SSE-LABEL: load_v8i16_v8i16: +; SSE: ## %bb.0: +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testb $1, %al +; SSE-NEXT: jne LBB21_1 +; SSE-NEXT: ## %bb.2: ## %else +; SSE-NEXT: testb $2, %al +; SSE-NEXT: jne LBB21_3 +; SSE-NEXT: LBB21_4: ## %else2 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: jne LBB21_5 +; SSE-NEXT: LBB21_6: ## %else5 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: jne LBB21_7 +; SSE-NEXT: LBB21_8: ## %else8 +; SSE-NEXT: testb $16, %al +; SSE-NEXT: jne LBB21_9 +; SSE-NEXT: LBB21_10: ## %else11 +; SSE-NEXT: testb $32, %al +; SSE-NEXT: jne LBB21_11 +; SSE-NEXT: LBB21_12: ## %else14 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: jne LBB21_13 +; SSE-NEXT: LBB21_14: ## %else17 +; SSE-NEXT: testb $-128, %al +; SSE-NEXT: jne LBB21_15 +; SSE-NEXT: LBB21_16: ## %else20 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: LBB21_1: ## %cond.load +; SSE-NEXT: pinsrw $0, (%rdi), %xmm1 +; SSE-NEXT: testb $2, %al +; SSE-NEXT: je LBB21_4 +; SSE-NEXT: LBB21_3: ## %cond.load1 +; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm1 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: je LBB21_6 +; SSE-NEXT: LBB21_5: ## %cond.load4 +; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm1 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: je LBB21_8 +; SSE-NEXT: LBB21_7: ## %cond.load7 +; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm1 +; SSE-NEXT: testb $16, %al +; SSE-NEXT: je LBB21_10 +; SSE-NEXT: LBB21_9: ## %cond.load10 +; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm1 +; SSE-NEXT: testb $32, %al +; SSE-NEXT: je LBB21_12 +; SSE-NEXT: LBB21_11: ## %cond.load13 +; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm1 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: je LBB21_14 +; SSE-NEXT: LBB21_13: ## %cond.load16 +; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm1 +; SSE-NEXT: testb $-128, %al +; SSE-NEXT: je LBB21_16 +; SSE-NEXT: LBB21_15: ## %cond.load19 +; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: load_v8i16_v8i16: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpextrb $0, %xmm2, %eax +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax ; AVX1OR2-NEXT: testb $1, %al -; AVX1OR2-NEXT: je LBB21_2 -; AVX1OR2-NEXT: ## %bb.1: ## %cond.load +; AVX1OR2-NEXT: jne LBB21_1 +; AVX1OR2-NEXT: ## %bb.2: ## %else +; AVX1OR2-NEXT: testb $2, %al +; AVX1OR2-NEXT: jne LBB21_3 +; AVX1OR2-NEXT: LBB21_4: ## %else2 +; AVX1OR2-NEXT: testb $4, %al +; AVX1OR2-NEXT: jne LBB21_5 +; AVX1OR2-NEXT: LBB21_6: ## %else5 +; AVX1OR2-NEXT: testb $8, %al +; AVX1OR2-NEXT: jne LBB21_7 +; AVX1OR2-NEXT: LBB21_8: ## %else8 +; AVX1OR2-NEXT: testb $16, %al +; AVX1OR2-NEXT: jne LBB21_9 +; AVX1OR2-NEXT: LBB21_10: ## %else11 +; AVX1OR2-NEXT: testb $32, %al +; AVX1OR2-NEXT: jne LBB21_11 +; AVX1OR2-NEXT: LBB21_12: ## %else14 +; AVX1OR2-NEXT: testb $64, %al +; AVX1OR2-NEXT: jne LBB21_13 +; AVX1OR2-NEXT: LBB21_14: ## %else17 +; AVX1OR2-NEXT: testb $-128, %al +; AVX1OR2-NEXT: jne LBB21_15 +; AVX1OR2-NEXT: LBB21_16: ## %else20 +; AVX1OR2-NEXT: vmovdqa %xmm1, 
%xmm0 +; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: LBB21_1: ## %cond.load ; AVX1OR2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_2: ## %else -; AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $2, %al ; AVX1OR2-NEXT: je LBB21_4 -; AVX1OR2-NEXT: ## %bb.3: ## %cond.load1 +; AVX1OR2-NEXT: LBB21_3: ## %cond.load1 ; AVX1OR2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_4: ## %else2 -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $4, %al ; AVX1OR2-NEXT: je LBB21_6 -; AVX1OR2-NEXT: ## %bb.5: ## %cond.load4 +; AVX1OR2-NEXT: LBB21_5: ## %cond.load4 ; AVX1OR2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_6: ## %else5 -; AVX1OR2-NEXT: vpextrb $6, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $8, %al ; AVX1OR2-NEXT: je LBB21_8 -; AVX1OR2-NEXT: ## %bb.7: ## %cond.load7 +; AVX1OR2-NEXT: LBB21_7: ## %cond.load7 ; AVX1OR2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_8: ## %else8 -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpextrb $8, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $16, %al ; AVX1OR2-NEXT: je LBB21_10 -; AVX1OR2-NEXT: ## %bb.9: ## %cond.load10 +; AVX1OR2-NEXT: LBB21_9: ## %cond.load10 ; AVX1OR2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_10: ## %else11 -; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $32, %al ; AVX1OR2-NEXT: je LBB21_12 -; AVX1OR2-NEXT: ## %bb.11: ## %cond.load13 +; AVX1OR2-NEXT: LBB21_11: ## %cond.load13 ; AVX1OR2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_12: ## %else14 -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 -; AVX1OR2-NEXT: vpextrb $12, %xmm0, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $64, %al ; AVX1OR2-NEXT: je LBB21_14 -; AVX1OR2-NEXT: ## %bb.13: ## %cond.load16 +; AVX1OR2-NEXT: LBB21_13: ## %cond.load16 ; AVX1OR2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_14: ## %else17 -; AVX1OR2-NEXT: vpextrb $14, %xmm0, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $-128, %al ; AVX1OR2-NEXT: je LBB21_16 -; AVX1OR2-NEXT: ## %bb.15: ## %cond.load19 +; AVX1OR2-NEXT: LBB21_15: ## %cond.load19 ; AVX1OR2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 -; AVX1OR2-NEXT: LBB21_16: ## %else20 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: load_v8i16_v8i16: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 +; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB21_2 -; AVX512F-NEXT: ## %bb.1: ## %cond.load +; AVX512F-NEXT: jne LBB21_1 +; AVX512F-NEXT: ## %bb.2: ## %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne LBB21_3 +; AVX512F-NEXT: LBB21_4: ## %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne LBB21_5 +; AVX512F-NEXT: LBB21_6: ## %else5 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne LBB21_7 +; AVX512F-NEXT: LBB21_8: ## %else8 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne LBB21_9 +; AVX512F-NEXT: LBB21_10: ## %else11 +; 
AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne LBB21_11 +; AVX512F-NEXT: LBB21_12: ## %else14 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne LBB21_13 +; AVX512F-NEXT: LBB21_14: ## %else17 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne LBB21_15 +; AVX512F-NEXT: LBB21_16: ## %else20 +; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: LBB21_1: ## %cond.load ; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_2: ## %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je LBB21_4 -; AVX512F-NEXT: ## %bb.3: ## %cond.load1 +; AVX512F-NEXT: LBB21_3: ## %cond.load1 ; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_4: ## %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je LBB21_6 -; AVX512F-NEXT: ## %bb.5: ## %cond.load4 +; AVX512F-NEXT: LBB21_5: ## %cond.load4 ; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_6: ## %else5 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je LBB21_8 -; AVX512F-NEXT: ## %bb.7: ## %cond.load7 +; AVX512F-NEXT: LBB21_7: ## %cond.load7 ; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_8: ## %else8 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je LBB21_10 -; AVX512F-NEXT: ## %bb.9: ## %cond.load10 +; AVX512F-NEXT: LBB21_9: ## %cond.load10 ; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_10: ## %else11 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je LBB21_12 -; AVX512F-NEXT: ## %bb.11: ## %cond.load13 +; AVX512F-NEXT: LBB21_11: ## %cond.load13 ; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_12: ## %else14 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je LBB21_14 -; AVX512F-NEXT: ## %bb.13: ## %cond.load16 +; AVX512F-NEXT: LBB21_13: ## %cond.load16 ; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_14: ## %else17 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je LBB21_16 -; AVX512F-NEXT: ## %bb.15: ## %cond.load19 +; AVX512F-NEXT: LBB21_15: ## %cond.load19 ; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: LBB21_16: ## %else20 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2910,76 +2873,67 @@ ; AVX512VLDQ-LABEL: load_v8i16_v8i16: ; AVX512VLDQ: ## %bb.0: ; 
AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 ; AVX512VLDQ-NEXT: kmovw %k0, %eax ; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB21_2 -; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load +; AVX512VLDQ-NEXT: jne LBB21_1 +; AVX512VLDQ-NEXT: ## %bb.2: ## %else +; AVX512VLDQ-NEXT: testb $2, %al +; AVX512VLDQ-NEXT: jne LBB21_3 +; AVX512VLDQ-NEXT: LBB21_4: ## %else2 +; AVX512VLDQ-NEXT: testb $4, %al +; AVX512VLDQ-NEXT: jne LBB21_5 +; AVX512VLDQ-NEXT: LBB21_6: ## %else5 +; AVX512VLDQ-NEXT: testb $8, %al +; AVX512VLDQ-NEXT: jne LBB21_7 +; AVX512VLDQ-NEXT: LBB21_8: ## %else8 +; AVX512VLDQ-NEXT: testb $16, %al +; AVX512VLDQ-NEXT: jne LBB21_9 +; AVX512VLDQ-NEXT: LBB21_10: ## %else11 +; AVX512VLDQ-NEXT: testb $32, %al +; AVX512VLDQ-NEXT: jne LBB21_11 +; AVX512VLDQ-NEXT: LBB21_12: ## %else14 +; AVX512VLDQ-NEXT: testb $64, %al +; AVX512VLDQ-NEXT: jne LBB21_13 +; AVX512VLDQ-NEXT: LBB21_14: ## %else17 +; AVX512VLDQ-NEXT: testb $-128, %al +; AVX512VLDQ-NEXT: jne LBB21_15 +; AVX512VLDQ-NEXT: LBB21_16: ## %else20 +; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; AVX512VLDQ-NEXT: LBB21_1: ## %cond.load ; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_2: ## %else -; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $2, %al ; AVX512VLDQ-NEXT: je LBB21_4 -; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 +; AVX512VLDQ-NEXT: LBB21_3: ## %cond.load1 ; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_4: ## %else2 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 -; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $4, %al ; AVX512VLDQ-NEXT: je LBB21_6 -; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4 +; AVX512VLDQ-NEXT: LBB21_5: ## %cond.load4 ; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_6: ## %else5 -; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $8, %al ; AVX512VLDQ-NEXT: je LBB21_8 -; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7 +; AVX512VLDQ-NEXT: LBB21_7: ## %cond.load7 ; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_8: ## %else8 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 -; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $16, %al ; AVX512VLDQ-NEXT: je LBB21_10 -; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10 +; AVX512VLDQ-NEXT: LBB21_9: ## %cond.load10 ; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_10: ## %else11 -; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $32, %al ; AVX512VLDQ-NEXT: je LBB21_12 -; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13 +; AVX512VLDQ-NEXT: LBB21_11: ## 
%cond.load13 ; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_12: ## %else14 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 -; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $64, %al ; AVX512VLDQ-NEXT: je LBB21_14 -; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16 +; AVX512VLDQ-NEXT: LBB21_13: ## %cond.load16 ; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_14: ## %else17 -; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $-128, %al ; AVX512VLDQ-NEXT: je LBB21_16 -; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19 +; AVX512VLDQ-NEXT: LBB21_15: ## %cond.load19 ; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 -; AVX512VLDQ-NEXT: LBB21_16: ## %else20 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq @@ -2995,875 +2949,704 @@ } define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <16 x i16> %dst) { -; SSE2-LABEL: load_v16i16_v16i16: -; SSE2: ## %bb.0: -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_2 -; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: pinsrw $0, (%rdi), %xmm2 -; SSE2-NEXT: LBB22_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_4 -; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: pinsrw $1, 2(%rdi), %xmm2 -; SSE2-NEXT: LBB22_4: ## %else2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE2-NEXT: pextrw $2, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_6 -; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: pinsrw $2, 4(%rdi), %xmm2 -; SSE2-NEXT: LBB22_6: ## %else5 -; SSE2-NEXT: pextrw $3, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_8 -; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: pinsrw $3, 6(%rdi), %xmm2 -; SSE2-NEXT: LBB22_8: ## %else8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE2-NEXT: pextrw $4, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_10 -; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: pinsrw $4, 8(%rdi), %xmm2 -; SSE2-NEXT: LBB22_10: ## %else11 -; SSE2-NEXT: pextrw $5, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_12 -; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: pinsrw $5, 10(%rdi), %xmm2 -; SSE2-NEXT: LBB22_12: ## %else14 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_14 -; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: pinsrw $6, 12(%rdi), %xmm2 -; SSE2-NEXT: LBB22_14: ## %else17 -; SSE2-NEXT: pextrw $7, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_16 -; SSE2-NEXT: ## %bb.15: ## %cond.load19 -; SSE2-NEXT: pinsrw $7, 14(%rdi), %xmm2 -; SSE2-NEXT: LBB22_16: ## %else20 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_18 -; SSE2-NEXT: ## %bb.17: ## %cond.load22 -; SSE2-NEXT: pinsrw $0, 16(%rdi), %xmm3 -; SSE2-NEXT: LBB22_18: ## %else23 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_20 -; SSE2-NEXT: ## 
%bb.19: ## %cond.load25 -; SSE2-NEXT: pinsrw $1, 18(%rdi), %xmm3 -; SSE2-NEXT: LBB22_20: ## %else26 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_22 -; SSE2-NEXT: ## %bb.21: ## %cond.load28 -; SSE2-NEXT: pinsrw $2, 20(%rdi), %xmm3 -; SSE2-NEXT: LBB22_22: ## %else29 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_24 -; SSE2-NEXT: ## %bb.23: ## %cond.load31 -; SSE2-NEXT: pinsrw $3, 22(%rdi), %xmm3 -; SSE2-NEXT: LBB22_24: ## %else32 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_26 -; SSE2-NEXT: ## %bb.25: ## %cond.load34 -; SSE2-NEXT: pinsrw $4, 24(%rdi), %xmm3 -; SSE2-NEXT: LBB22_26: ## %else35 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_28 -; SSE2-NEXT: ## %bb.27: ## %cond.load37 -; SSE2-NEXT: pinsrw $5, 26(%rdi), %xmm3 -; SSE2-NEXT: LBB22_28: ## %else38 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_30 -; SSE2-NEXT: ## %bb.29: ## %cond.load40 -; SSE2-NEXT: pinsrw $6, 28(%rdi), %xmm3 -; SSE2-NEXT: LBB22_30: ## %else41 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB22_32 -; SSE2-NEXT: ## %bb.31: ## %cond.load43 -; SSE2-NEXT: pinsrw $7, 30(%rdi), %xmm3 -; SSE2-NEXT: LBB22_32: ## %else44 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: load_v16i16_v16i16: -; SSE42: ## %bb.0: -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE42-NEXT: pextrb $0, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: pinsrw $0, (%rdi), %xmm2 -; SSE42-NEXT: LBB22_2: ## %else -; SSE42-NEXT: pextrb $2, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: pinsrw $1, 2(%rdi), %xmm2 -; SSE42-NEXT: LBB22_4: ## %else2 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE42-NEXT: pextrb $4, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: pinsrw $2, 4(%rdi), %xmm2 -; SSE42-NEXT: LBB22_6: ## %else5 -; SSE42-NEXT: pextrb $6, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: pinsrw $3, 6(%rdi), %xmm2 -; SSE42-NEXT: LBB22_8: ## %else8 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE42-NEXT: pextrb $8, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: pinsrw $4, 8(%rdi), %xmm2 -; SSE42-NEXT: LBB22_10: ## %else11 -; SSE42-NEXT: pextrb $10, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: pinsrw $5, 10(%rdi), %xmm2 -; SSE42-NEXT: LBB22_12: ## %else14 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtw %xmm0, %xmm4 -; SSE42-NEXT: pextrb $12, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: pinsrw $6, 12(%rdi), %xmm2 -; SSE42-NEXT: LBB22_14: ## %else17 -; SSE42-NEXT: pextrb $14, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_16 -; SSE42-NEXT: ## 
%bb.15: ## %cond.load19 -; SSE42-NEXT: pinsrw $7, 14(%rdi), %xmm2 -; SSE42-NEXT: LBB22_16: ## %else20 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE42-NEXT: pextrb $0, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_18 -; SSE42-NEXT: ## %bb.17: ## %cond.load22 -; SSE42-NEXT: pinsrw $0, 16(%rdi), %xmm3 -; SSE42-NEXT: LBB22_18: ## %else23 -; SSE42-NEXT: pextrb $2, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_20 -; SSE42-NEXT: ## %bb.19: ## %cond.load25 -; SSE42-NEXT: pinsrw $1, 18(%rdi), %xmm3 -; SSE42-NEXT: LBB22_20: ## %else26 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE42-NEXT: pextrb $4, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_22 -; SSE42-NEXT: ## %bb.21: ## %cond.load28 -; SSE42-NEXT: pinsrw $2, 20(%rdi), %xmm3 -; SSE42-NEXT: LBB22_22: ## %else29 -; SSE42-NEXT: pextrb $6, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_24 -; SSE42-NEXT: ## %bb.23: ## %cond.load31 -; SSE42-NEXT: pinsrw $3, 22(%rdi), %xmm3 -; SSE42-NEXT: LBB22_24: ## %else32 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE42-NEXT: pextrb $8, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_26 -; SSE42-NEXT: ## %bb.25: ## %cond.load34 -; SSE42-NEXT: pinsrw $4, 24(%rdi), %xmm3 -; SSE42-NEXT: LBB22_26: ## %else35 -; SSE42-NEXT: pextrb $10, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_28 -; SSE42-NEXT: ## %bb.27: ## %cond.load37 -; SSE42-NEXT: pinsrw $5, 26(%rdi), %xmm3 -; SSE42-NEXT: LBB22_28: ## %else38 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_30 -; SSE42-NEXT: ## %bb.29: ## %cond.load40 -; SSE42-NEXT: pinsrw $6, 28(%rdi), %xmm3 -; SSE42-NEXT: LBB22_30: ## %else41 -; SSE42-NEXT: pextrb $14, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB22_32 -; SSE42-NEXT: ## %bb.31: ## %cond.load43 -; SSE42-NEXT: pinsrw $7, 30(%rdi), %xmm3 -; SSE42-NEXT: LBB22_32: ## %else44 -; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: load_v16i16_v16i16: +; SSE: ## %bb.0: +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testb $1, %al +; SSE-NEXT: jne LBB22_1 +; SSE-NEXT: ## %bb.2: ## %else +; SSE-NEXT: testb $2, %al +; SSE-NEXT: jne LBB22_3 +; SSE-NEXT: LBB22_4: ## %else2 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: jne LBB22_5 +; SSE-NEXT: LBB22_6: ## %else5 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: jne LBB22_7 +; SSE-NEXT: LBB22_8: ## %else8 +; SSE-NEXT: testb $16, %al +; SSE-NEXT: jne LBB22_9 +; SSE-NEXT: LBB22_10: ## %else11 +; SSE-NEXT: testb $32, %al +; SSE-NEXT: jne LBB22_11 +; SSE-NEXT: LBB22_12: ## %else14 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: jne LBB22_13 +; SSE-NEXT: LBB22_14: ## %else17 +; SSE-NEXT: testb $-128, %al +; SSE-NEXT: jne LBB22_15 +; SSE-NEXT: LBB22_16: ## %else20 +; SSE-NEXT: testl $256, %eax ## imm = 0x100 +; SSE-NEXT: jne LBB22_17 +; SSE-NEXT: LBB22_18: ## %else23 +; SSE-NEXT: testl $512, %eax ## imm = 0x200 +; SSE-NEXT: jne LBB22_19 +; SSE-NEXT: LBB22_20: ## %else26 +; SSE-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE-NEXT: jne LBB22_21 +; SSE-NEXT: LBB22_22: ## %else29 +; SSE-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE-NEXT: jne LBB22_23 +; SSE-NEXT: LBB22_24: ## %else32 +; SSE-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE-NEXT: jne LBB22_25 +; SSE-NEXT: LBB22_26: ## %else35 +; SSE-NEXT: testl $8192, %eax 
## imm = 0x2000 +; SSE-NEXT: jne LBB22_27 +; SSE-NEXT: LBB22_28: ## %else38 +; SSE-NEXT: testl $16384, %eax ## imm = 0x4000 +; SSE-NEXT: jne LBB22_29 +; SSE-NEXT: LBB22_30: ## %else41 +; SSE-NEXT: testl $32768, %eax ## imm = 0x8000 +; SSE-NEXT: je LBB22_32 +; SSE-NEXT: LBB22_31: ## %cond.load43 +; SSE-NEXT: pinsrw $7, 30(%rdi), %xmm3 +; SSE-NEXT: LBB22_32: ## %else44 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: retq +; SSE-NEXT: LBB22_1: ## %cond.load +; SSE-NEXT: pinsrw $0, (%rdi), %xmm2 +; SSE-NEXT: testb $2, %al +; SSE-NEXT: je LBB22_4 +; SSE-NEXT: LBB22_3: ## %cond.load1 +; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm2 +; SSE-NEXT: testb $4, %al +; SSE-NEXT: je LBB22_6 +; SSE-NEXT: LBB22_5: ## %cond.load4 +; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm2 +; SSE-NEXT: testb $8, %al +; SSE-NEXT: je LBB22_8 +; SSE-NEXT: LBB22_7: ## %cond.load7 +; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm2 +; SSE-NEXT: testb $16, %al +; SSE-NEXT: je LBB22_10 +; SSE-NEXT: LBB22_9: ## %cond.load10 +; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm2 +; SSE-NEXT: testb $32, %al +; SSE-NEXT: je LBB22_12 +; SSE-NEXT: LBB22_11: ## %cond.load13 +; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm2 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: je LBB22_14 +; SSE-NEXT: LBB22_13: ## %cond.load16 +; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm2 +; SSE-NEXT: testb $-128, %al +; SSE-NEXT: je LBB22_16 +; SSE-NEXT: LBB22_15: ## %cond.load19 +; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm2 +; SSE-NEXT: testl $256, %eax ## imm = 0x100 +; SSE-NEXT: je LBB22_18 +; SSE-NEXT: LBB22_17: ## %cond.load22 +; SSE-NEXT: pinsrw $0, 16(%rdi), %xmm3 +; SSE-NEXT: testl $512, %eax ## imm = 0x200 +; SSE-NEXT: je LBB22_20 +; SSE-NEXT: LBB22_19: ## %cond.load25 +; SSE-NEXT: pinsrw $1, 18(%rdi), %xmm3 +; SSE-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE-NEXT: je LBB22_22 +; SSE-NEXT: LBB22_21: ## %cond.load28 +; SSE-NEXT: pinsrw $2, 20(%rdi), %xmm3 +; SSE-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE-NEXT: je LBB22_24 +; SSE-NEXT: LBB22_23: ## %cond.load31 +; SSE-NEXT: pinsrw $3, 22(%rdi), %xmm3 +; SSE-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE-NEXT: je LBB22_26 +; SSE-NEXT: LBB22_25: ## %cond.load34 +; SSE-NEXT: pinsrw $4, 24(%rdi), %xmm3 +; SSE-NEXT: testl $8192, %eax ## imm = 0x2000 +; SSE-NEXT: je LBB22_28 +; SSE-NEXT: LBB22_27: ## %cond.load37 +; SSE-NEXT: pinsrw $5, 26(%rdi), %xmm3 +; SSE-NEXT: testl $16384, %eax ## imm = 0x4000 +; SSE-NEXT: je LBB22_30 +; SSE-NEXT: LBB22_29: ## %cond.load40 +; SSE-NEXT: pinsrw $6, 28(%rdi), %xmm3 +; SSE-NEXT: testl $32768, %eax ## imm = 0x8000 +; SSE-NEXT: jne LBB22_31 +; SSE-NEXT: jmp LBB22_32 ; ; AVX1-LABEL: load_v16i16_v16i16: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_2 -; AVX1-NEXT: ## %bb.1: ## %cond.load -; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: LBB22_2: ## %else -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_4 -; AVX1-NEXT: ## %bb.3: ## %cond.load1 -; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: jne LBB22_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB22_3 ; AVX1-NEXT: LBB22_4: ## 
%else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_6 -; AVX1-NEXT: ## %bb.5: ## %cond.load4 -; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne LBB22_5 ; AVX1-NEXT: LBB22_6: ## %else5 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_8 -; AVX1-NEXT: ## %bb.7: ## %cond.load7 -; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne LBB22_7 ; AVX1-NEXT: LBB22_8: ## %else8 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_10 -; AVX1-NEXT: ## %bb.9: ## %cond.load10 -; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne LBB22_9 ; AVX1-NEXT: LBB22_10: ## %else11 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_12 -; AVX1-NEXT: ## %bb.11: ## %cond.load13 -; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne LBB22_11 ; AVX1-NEXT: LBB22_12: ## %else14 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_14 -; AVX1-NEXT: ## %bb.13: ## %cond.load16 -; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne LBB22_13 ; AVX1-NEXT: LBB22_14: ## %else17 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_16 -; AVX1-NEXT: ## %bb.15: ## %cond.load19 -; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne LBB22_15 ; AVX1-NEXT: LBB22_16: ## %else20 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_18 -; AVX1-NEXT: ## %bb.17: ## %cond.load22 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 +; AVX1-NEXT: jne LBB22_17 ; AVX1-NEXT: LBB22_18: ## %else23 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_20 -; AVX1-NEXT: ## %bb.19: ## %cond.load25 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 +; AVX1-NEXT: jne LBB22_19 ; AVX1-NEXT: LBB22_20: ## %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_22 -; AVX1-NEXT: ## %bb.21: ## %cond.load28 -; AVX1-NEXT: vextractf128 $1, %ymm1, 
%xmm3 -; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX1-NEXT: jne LBB22_21 ; AVX1-NEXT: LBB22_22: ## %else29 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_24 -; AVX1-NEXT: ## %bb.23: ## %cond.load31 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX1-NEXT: jne LBB22_23 ; AVX1-NEXT: LBB22_24: ## %else32 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_26 -; AVX1-NEXT: ## %bb.25: ## %cond.load34 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX1-NEXT: jne LBB22_25 ; AVX1-NEXT: LBB22_26: ## %else35 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_28 -; AVX1-NEXT: ## %bb.27: ## %cond.load37 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX1-NEXT: jne LBB22_27 ; AVX1-NEXT: LBB22_28: ## %else38 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB22_30 -; AVX1-NEXT: ## %bb.29: ## %cond.load40 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX1-NEXT: jne LBB22_29 ; AVX1-NEXT: LBB22_30: ## %else41 -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX1-NEXT: jne LBB22_31 +; AVX1-NEXT: LBB22_32: ## %else44 +; AVX1-NEXT: vmovaps %ymm1, %ymm0 +; AVX1-NEXT: retq +; AVX1-NEXT: LBB22_1: ## %cond.load +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je LBB22_4 +; AVX1-NEXT: LBB22_3: ## %cond.load1 +; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je LBB22_6 +; AVX1-NEXT: LBB22_5: ## %cond.load4 +; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je LBB22_8 +; AVX1-NEXT: LBB22_7: ## %cond.load7 +; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: je LBB22_10 +; AVX1-NEXT: LBB22_9: ## %cond.load10 +; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: je LBB22_12 +; AVX1-NEXT: LBB22_11: ## %cond.load13 +; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: je LBB22_14 +; AVX1-NEXT: LBB22_13: ## %cond.load16 +; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 
+; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: je LBB22_16 +; AVX1-NEXT: LBB22_15: ## %cond.load19 +; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 +; AVX1-NEXT: je LBB22_18 +; AVX1-NEXT: LBB22_17: ## %cond.load22 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 +; AVX1-NEXT: je LBB22_20 +; AVX1-NEXT: LBB22_19: ## %cond.load25 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX1-NEXT: je LBB22_22 +; AVX1-NEXT: LBB22_21: ## %cond.load28 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX1-NEXT: je LBB22_24 +; AVX1-NEXT: LBB22_23: ## %cond.load31 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX1-NEXT: je LBB22_26 +; AVX1-NEXT: LBB22_25: ## %cond.load34 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX1-NEXT: je LBB22_28 +; AVX1-NEXT: LBB22_27: ## %cond.load37 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX1-NEXT: je LBB22_30 +; AVX1-NEXT: LBB22_29: ## %cond.load40 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX1-NEXT: je LBB22_32 -; AVX1-NEXT: ## %bb.31: ## %cond.load43 +; AVX1-NEXT: LBB22_31: ## %cond.load43 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: LBB22_32: ## %else44 ; AVX1-NEXT: vmovaps %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v16i16_v16i16: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $0, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_2 -; AVX2-NEXT: ## %bb.1: ## %cond.load -; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: LBB22_2: ## %else -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_4 -; AVX2-NEXT: ## %bb.3: ## %cond.load1 -; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne LBB22_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB22_3 ; AVX2-NEXT: LBB22_4: ## %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $4, %xmm3, %eax -; 
AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_6 -; AVX2-NEXT: ## %bb.5: ## %cond.load4 -; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne LBB22_5 ; AVX2-NEXT: LBB22_6: ## %else5 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_8 -; AVX2-NEXT: ## %bb.7: ## %cond.load7 -; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne LBB22_7 ; AVX2-NEXT: LBB22_8: ## %else8 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $8, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_10 -; AVX2-NEXT: ## %bb.9: ## %cond.load10 -; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne LBB22_9 ; AVX2-NEXT: LBB22_10: ## %else11 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_12 -; AVX2-NEXT: ## %bb.11: ## %cond.load13 -; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne LBB22_11 ; AVX2-NEXT: LBB22_12: ## %else14 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_14 -; AVX2-NEXT: ## %bb.13: ## %cond.load16 -; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne LBB22_13 ; AVX2-NEXT: LBB22_14: ## %else17 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_16 -; AVX2-NEXT: ## %bb.15: ## %cond.load19 -; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne LBB22_15 ; AVX2-NEXT: LBB22_16: ## %else20 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_18 -; AVX2-NEXT: ## %bb.17: ## %cond.load22 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 +; AVX2-NEXT: jne LBB22_17 ; AVX2-NEXT: LBB22_18: ## %else23 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_20 -; AVX2-NEXT: ## %bb.19: ## %cond.load25 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $512, %eax ## imm = 0x200 +; AVX2-NEXT: jne LBB22_19 ; AVX2-NEXT: LBB22_20: ## %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_22 -; AVX2-NEXT: ## %bb.21: ## %cond.load28 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $1024, 
%eax ## imm = 0x400 +; AVX2-NEXT: jne LBB22_21 ; AVX2-NEXT: LBB22_22: ## %else29 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_24 -; AVX2-NEXT: ## %bb.23: ## %cond.load31 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX2-NEXT: jne LBB22_23 ; AVX2-NEXT: LBB22_24: ## %else32 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_26 -; AVX2-NEXT: ## %bb.25: ## %cond.load34 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX2-NEXT: jne LBB22_25 ; AVX2-NEXT: LBB22_26: ## %else35 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_28 -; AVX2-NEXT: ## %bb.27: ## %cond.load37 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX2-NEXT: jne LBB22_27 ; AVX2-NEXT: LBB22_28: ## %else38 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB22_30 -; AVX2-NEXT: ## %bb.29: ## %cond.load40 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX2-NEXT: jne LBB22_29 ; AVX2-NEXT: LBB22_30: ## %else41 -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX2-NEXT: jne LBB22_31 +; AVX2-NEXT: LBB22_32: ## %else44 +; AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-NEXT: retq +; AVX2-NEXT: LBB22_1: ## %cond.load +; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je LBB22_4 +; AVX2-NEXT: LBB22_3: ## %cond.load1 +; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: je LBB22_6 +; AVX2-NEXT: LBB22_5: ## %cond.load4 +; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: je LBB22_8 +; AVX2-NEXT: LBB22_7: ## %cond.load7 +; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: je LBB22_10 +; AVX2-NEXT: LBB22_9: ## %cond.load10 +; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: je LBB22_12 +; AVX2-NEXT: LBB22_11: ## %cond.load13 +; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: je LBB22_14 +; AVX2-NEXT: LBB22_13: ## %cond.load16 +; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: je LBB22_16 +; AVX2-NEXT: LBB22_15: ## %cond.load19 +; AVX2-NEXT: vpinsrw $7, 14(%rdi), 
%xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 +; AVX2-NEXT: je LBB22_18 +; AVX2-NEXT: LBB22_17: ## %cond.load22 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $512, %eax ## imm = 0x200 +; AVX2-NEXT: je LBB22_20 +; AVX2-NEXT: LBB22_19: ## %cond.load25 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX2-NEXT: je LBB22_22 +; AVX2-NEXT: LBB22_21: ## %cond.load28 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX2-NEXT: je LBB22_24 +; AVX2-NEXT: LBB22_23: ## %cond.load31 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX2-NEXT: je LBB22_26 +; AVX2-NEXT: LBB22_25: ## %cond.load34 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX2-NEXT: je LBB22_28 +; AVX2-NEXT: LBB22_27: ## %cond.load37 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX2-NEXT: je LBB22_30 +; AVX2-NEXT: LBB22_29: ## %cond.load40 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB22_32 -; AVX2-NEXT: ## %bb.31: ## %cond.load43 +; AVX2-NEXT: LBB22_31: ## %cond.load43 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-NEXT: LBB22_32: ## %else44 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v16i16_v16i16: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_2 -; AVX512F-NEXT: ## %bb.1: ## %cond.load -; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: LBB22_2: ## %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_4 -; AVX512F-NEXT: ## %bb.3: ## %cond.load1 -; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: jne LBB22_1 +; AVX512F-NEXT: ## %bb.2: ## %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne LBB22_3 ; AVX512F-NEXT: LBB22_4: ## %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: 
kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_6 -; AVX512F-NEXT: ## %bb.5: ## %cond.load4 -; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne LBB22_5 ; AVX512F-NEXT: LBB22_6: ## %else5 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_8 -; AVX512F-NEXT: ## %bb.7: ## %cond.load7 -; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne LBB22_7 ; AVX512F-NEXT: LBB22_8: ## %else8 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_10 -; AVX512F-NEXT: ## %bb.9: ## %cond.load10 -; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne LBB22_9 ; AVX512F-NEXT: LBB22_10: ## %else11 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_12 -; AVX512F-NEXT: ## %bb.11: ## %cond.load13 -; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne LBB22_11 ; AVX512F-NEXT: LBB22_12: ## %else14 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_14 -; AVX512F-NEXT: ## %bb.13: ## %cond.load16 -; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne LBB22_13 ; AVX512F-NEXT: LBB22_14: ## %else17 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_16 -; AVX512F-NEXT: ## %bb.15: ## %cond.load19 -; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne LBB22_15 ; AVX512F-NEXT: LBB22_16: ## %else20 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_18 -; AVX512F-NEXT: ## %bb.17: ## %cond.load22 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 +; AVX512F-NEXT: jne LBB22_17 ; AVX512F-NEXT: LBB22_18: ## %else23 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_20 -; AVX512F-NEXT: ## %bb.19: ## %cond.load25 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: 
vpinsrw $1, 18(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 +; AVX512F-NEXT: jne LBB22_19 ; AVX512F-NEXT: LBB22_20: ## %else26 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_22 -; AVX512F-NEXT: ## %bb.21: ## %cond.load28 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX512F-NEXT: jne LBB22_21 ; AVX512F-NEXT: LBB22_22: ## %else29 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_24 -; AVX512F-NEXT: ## %bb.23: ## %cond.load31 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX512F-NEXT: jne LBB22_23 ; AVX512F-NEXT: LBB22_24: ## %else32 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_26 -; AVX512F-NEXT: ## %bb.25: ## %cond.load34 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX512F-NEXT: jne LBB22_25 ; AVX512F-NEXT: LBB22_26: ## %else35 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB22_28 -; AVX512F-NEXT: ## %bb.27: ## %cond.load37 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX512F-NEXT: jne LBB22_27 ; AVX512F-NEXT: LBB22_28: ## %else38 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX512F-NEXT: jne LBB22_29 +; AVX512F-NEXT: LBB22_30: ## %else41 +; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX512F-NEXT: jne LBB22_31 +; AVX512F-NEXT: LBB22_32: ## %else44 +; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-NEXT: retq +; AVX512F-NEXT: LBB22_1: ## %cond.load +; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: je LBB22_4 +; AVX512F-NEXT: LBB22_3: ## %cond.load1 +; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: je LBB22_6 +; AVX512F-NEXT: LBB22_5: ## %cond.load4 +; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $8, %al +; 
AVX512F-NEXT: je LBB22_8 +; AVX512F-NEXT: LBB22_7: ## %cond.load7 +; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: je LBB22_10 +; AVX512F-NEXT: LBB22_9: ## %cond.load10 +; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: je LBB22_12 +; AVX512F-NEXT: LBB22_11: ## %cond.load13 +; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: je LBB22_14 +; AVX512F-NEXT: LBB22_13: ## %cond.load16 +; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: je LBB22_16 +; AVX512F-NEXT: LBB22_15: ## %cond.load19 +; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 +; AVX512F-NEXT: je LBB22_18 +; AVX512F-NEXT: LBB22_17: ## %cond.load22 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 +; AVX512F-NEXT: je LBB22_20 +; AVX512F-NEXT: LBB22_19: ## %cond.load25 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX512F-NEXT: je LBB22_22 +; AVX512F-NEXT: LBB22_21: ## %cond.load28 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX512F-NEXT: je LBB22_24 +; AVX512F-NEXT: LBB22_23: ## %cond.load31 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX512F-NEXT: je LBB22_26 +; AVX512F-NEXT: LBB22_25: ## %cond.load34 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX512F-NEXT: je LBB22_28 +; AVX512F-NEXT: LBB22_27: ## %cond.load37 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX512F-NEXT: je LBB22_30 -; AVX512F-NEXT: ## %bb.29: ## %cond.load40 +; AVX512F-NEXT: LBB22_29: ## %cond.load40 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: LBB22_30: ## %else41 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX512F-NEXT: je LBB22_32 -; AVX512F-NEXT: ## %bb.31: ## %cond.load43 +; AVX512F-NEXT: LBB22_31: ## %cond.load43 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: LBB22_32: ## %else44 ; 
AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: load_v16i16_v16i16: ; AVX512VLDQ: ## %bb.0: ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB22_2 -; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load -; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512VLDQ-NEXT: LBB22_2: ## %else -; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512VLDQ-NEXT: kmovw %k0, %eax ; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB22_4 -; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 -; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: jne LBB22_1 +; AVX512VLDQ-NEXT: ## %bb.2: ## %else +; AVX512VLDQ-NEXT: testb $2, %al +; AVX512VLDQ-NEXT: jne LBB22_3 ; AVX512VLDQ-NEXT: LBB22_4: ## %else2 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB22_6 -; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4 -; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $4, %al +; AVX512VLDQ-NEXT: jne LBB22_5 ; AVX512VLDQ-NEXT: LBB22_6: ## %else5 -; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB22_8 -; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7 -; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $8, %al +; AVX512VLDQ-NEXT: jne LBB22_7 ; AVX512VLDQ-NEXT: LBB22_8: ## %else8 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB22_10 -; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10 -; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $16, %al +; AVX512VLDQ-NEXT: jne LBB22_9 ; AVX512VLDQ-NEXT: LBB22_10: ## %else11 -; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB22_12 -; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13 -; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $32, %al +; AVX512VLDQ-NEXT: jne LBB22_11 ; AVX512VLDQ-NEXT: LBB22_12: ## %else14 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb 
$1, %al
-; AVX512VLDQ-NEXT: je LBB22_14
-; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16
-; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2
-; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $64, %al
+; AVX512VLDQ-NEXT: jne LBB22_13
; AVX512VLDQ-NEXT: LBB22_14: ## %else17
-; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB22_16
-; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19
-; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2
-; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: jne LBB22_15
; AVX512VLDQ-NEXT: LBB22_16: ## %else20
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB22_18
-; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load22
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512VLDQ-NEXT: jne LBB22_17
; AVX512VLDQ-NEXT: LBB22_18: ## %else23
-; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB22_20
-; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load25
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512VLDQ-NEXT: jne LBB22_19
; AVX512VLDQ-NEXT: LBB22_20: ## %else26
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB22_22
-; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load28
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512VLDQ-NEXT: jne LBB22_21
; AVX512VLDQ-NEXT: LBB22_22: ## %else29
-; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB22_24
-; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load31
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512VLDQ-NEXT: jne LBB22_23
; AVX512VLDQ-NEXT: LBB22_24: ## %else32
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB22_26
-; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load34
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512VLDQ-NEXT: jne LBB22_25
; AVX512VLDQ-NEXT: LBB22_26: ## %else35
-; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB22_28
-; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load37
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512VLDQ-NEXT: jne LBB22_27
; AVX512VLDQ-NEXT: LBB22_28: ## %else38
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512VLDQ-NEXT: jne LBB22_29
+; AVX512VLDQ-NEXT: LBB22_30: ## %else41
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: jne LBB22_31
+; AVX512VLDQ-NEXT: LBB22_32: ## %else44
+; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLDQ-NEXT: retq
+; AVX512VLDQ-NEXT: LBB22_1: ## %cond.load
+; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $2, %al
+; AVX512VLDQ-NEXT: je LBB22_4
+; AVX512VLDQ-NEXT: LBB22_3: ## %cond.load1
+; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $4, %al
+; AVX512VLDQ-NEXT: je LBB22_6
+; AVX512VLDQ-NEXT: LBB22_5: ## %cond.load4
+; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $8, %al
+; AVX512VLDQ-NEXT: je LBB22_8
+; AVX512VLDQ-NEXT: LBB22_7: ## %cond.load7
+; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $16, %al
+; AVX512VLDQ-NEXT: je LBB22_10
+; AVX512VLDQ-NEXT: LBB22_9: ## %cond.load10
+; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $32, %al
+; AVX512VLDQ-NEXT: je LBB22_12
+; AVX512VLDQ-NEXT: LBB22_11: ## %cond.load13
+; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $64, %al
+; AVX512VLDQ-NEXT: je LBB22_14
+; AVX512VLDQ-NEXT: LBB22_13: ## %cond.load16
+; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: je LBB22_16
+; AVX512VLDQ-NEXT: LBB22_15: ## %cond.load19
+; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512VLDQ-NEXT: je LBB22_18
+; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512VLDQ-NEXT: je LBB22_20
+; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512VLDQ-NEXT: je LBB22_22
+; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512VLDQ-NEXT: je LBB22_24
+; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512VLDQ-NEXT: je LBB22_26
+; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512VLDQ-NEXT: je LBB22_28
+; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB22_30
-; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load40
+; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
-; AVX512VLDQ-NEXT: LBB22_30: ## %else41
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB22_32
-; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load43
+; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
-; AVX512VLDQ-NEXT: LBB22_32: ## %else44
; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLDQ-NEXT: retq
;
@@ -3884,195 +3667,197 @@
define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i8> %dst) {
; SSE2-LABEL: load_v16i8_v16i8:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB23_2
-; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: jne LBB23_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB23_3
+; SSE2-NEXT: LBB23_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB23_5
+; SSE2-NEXT: LBB23_6: ## %else5
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB23_7
+; SSE2-NEXT: LBB23_8: ## %else8
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: jne LBB23_9
+; SSE2-NEXT: LBB23_10: ## %else11
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: jne LBB23_11
+; SSE2-NEXT: LBB23_12: ## %else14
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: jne LBB23_13
+; SSE2-NEXT: LBB23_14: ## %else17
+; SSE2-NEXT: testb $-128, %al
+; SSE2-NEXT: jne LBB23_15
+; SSE2-NEXT: LBB23_16: ## %else20
+; SSE2-NEXT: testl $256, %eax ## imm = 0x100
+; SSE2-NEXT: jne LBB23_17
+; SSE2-NEXT: LBB23_18: ## %else23
+; SSE2-NEXT: testl $512, %eax ## imm = 0x200
+; SSE2-NEXT: jne LBB23_19
+; SSE2-NEXT: LBB23_20: ## %else26
+; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE2-NEXT: jne LBB23_21
+; SSE2-NEXT: LBB23_22: ## %else29
+; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
+; SSE2-NEXT: jne LBB23_23
+; SSE2-NEXT: LBB23_24: ## %else32
+; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE2-NEXT: jne LBB23_25
+; SSE2-NEXT: LBB23_26: ## %else35
+; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
+; SSE2-NEXT: jne LBB23_27
+; SSE2-NEXT: LBB23_28: ## %else38
+; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE2-NEXT: jne LBB23_29
+; SSE2-NEXT: LBB23_30: ## %else41
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE2-NEXT: jne LBB23_31
+; SSE2-NEXT: LBB23_32: ## %else44
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB23_1: ## %cond.load
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl (%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_2: ## %else
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB23_4
-; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movzbl 1(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: psllw $8, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_4: ## %else2
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: LBB23_3: ## %cond.load1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movzbl 1(%rdi), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: psllw $8, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB23_6
-; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: LBB23_5: ## %cond.load4
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 2(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_6: ## %else5
-; SSE2-NEXT: shrl $24, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB23_8
-; SSE2-NEXT: ## %bb.7: ## %cond.load7
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movzbl 3(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pslld $24, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_8: ## %else8
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB23_7: ## %cond.load7
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movzbl 3(%rdi), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pslld $24, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB23_10
-; SSE2-NEXT: ## %bb.9: ## %cond.load10
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: LBB23_9: ## %cond.load10
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 4(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_10: ## %else11
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB23_12
-; SSE2-NEXT: ## %bb.11: ## %cond.load13
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movzbl 5(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: psllq $40, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_12: ## %else14
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB23_11: ## %cond.load13
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movzbl 5(%rdi), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: psllq $40, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB23_14
-; SSE2-NEXT: ## %bb.13: ## %cond.load16
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: LBB23_13: ## %cond.load16
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 6(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: psllq $48, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_14: ## %else17
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: psllq $48, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB23_16
-; SSE2-NEXT: ## %bb.15: ## %cond.load19
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movzbl 7(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: psllq $56, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_16: ## %else20
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB23_15: ## %cond.load19
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movzbl 7(%rdi), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: psllq $56, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testl $256, %eax ## imm = 0x100
; SSE2-NEXT: je LBB23_18
-; SSE2-NEXT: ## %bb.17: ## %cond.load22
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: LBB23_17: ## %cond.load22
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 8(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_18: ## %else23
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testl $512, %eax ## imm = 0x200
; SSE2-NEXT: je LBB23_20
-; SSE2-NEXT: ## %bb.19: ## %cond.load25
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movzbl 9(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_20: ## %else26
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB23_19: ## %cond.load25
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movzbl 9(%rdi), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
; SSE2-NEXT: je LBB23_22
-; SSE2-NEXT: ## %bb.21: ## %cond.load28
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: LBB23_21: ## %cond.load28
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 10(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_22: ## %else29
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
; SSE2-NEXT: je LBB23_24
-; SSE2-NEXT: ## %bb.23: ## %cond.load31
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movzbl 11(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_24: ## %else32
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB23_23: ## %cond.load31
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movzbl 11(%rdi), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE2-NEXT: je LBB23_26
-; SSE2-NEXT: ## %bb.25: ## %cond.load34
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: LBB23_25: ## %cond.load34
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 12(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_26: ## %else35
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT: je LBB23_28
-; SSE2-NEXT: ## %bb.27: ## %cond.load37
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movzbl 13(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: LBB23_28: ## %else38
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $7, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB23_27: ## %cond.load37
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movzbl 13(%rdi), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE2-NEXT: je LBB23_30
-; SSE2-NEXT: ## %bb.29: ## %cond.load40
+; SSE2-NEXT: LBB23_29: ## %cond.load40
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movzbl 14(%rdi), %ecx
@@ -4080,550 +3865,487 @@
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: LBB23_30: ## %else41
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT: je LBB23_32
-; SSE2-NEXT: ## %bb.31: ## %cond.load43
+; SSE2-NEXT: LBB23_31: ## %cond.load43
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movzbl 15(%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: LBB23_32: ## %else44
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v16i8_v16i8:
; SSE42: ## %bb.0:
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: testb $1, %al
-; SSE42-NEXT: je LBB23_2
-; SSE42-NEXT: ## %bb.1: ## %cond.load
+; SSE42-NEXT: jne LBB23_1
+; SSE42-NEXT: ## %bb.2: ## %else
+; SSE42-NEXT: testb $2, %al
+; SSE42-NEXT: jne LBB23_3
+; SSE42-NEXT: LBB23_4: ## %else2
+; SSE42-NEXT: testb $4, %al
+; SSE42-NEXT: jne LBB23_5
+; SSE42-NEXT: LBB23_6: ## %else5
+; SSE42-NEXT: testb $8, %al
+; SSE42-NEXT: jne LBB23_7
+; SSE42-NEXT: LBB23_8: ## %else8
+; SSE42-NEXT: testb $16, %al
+; SSE42-NEXT: jne LBB23_9
+; SSE42-NEXT: LBB23_10: ## %else11
+; SSE42-NEXT: testb $32, %al
+; SSE42-NEXT: jne LBB23_11
+; SSE42-NEXT: LBB23_12: ## %else14
+; SSE42-NEXT: testb $64, %al
+; SSE42-NEXT: jne LBB23_13
+; SSE42-NEXT: LBB23_14: ## %else17
+; SSE42-NEXT: testb $-128, %al
+; SSE42-NEXT: jne LBB23_15
+; SSE42-NEXT: LBB23_16: ## %else20
+; SSE42-NEXT: testl $256, %eax ## imm = 0x100
+; SSE42-NEXT: jne LBB23_17
+; SSE42-NEXT: LBB23_18: ## %else23
+; SSE42-NEXT: testl $512, %eax ## imm = 0x200
+; SSE42-NEXT: jne LBB23_19
+; SSE42-NEXT: LBB23_20: ## %else26
+; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE42-NEXT: jne LBB23_21
+; SSE42-NEXT: LBB23_22: ## %else29
+; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
+; SSE42-NEXT: jne LBB23_23
+; SSE42-NEXT: LBB23_24: ## %else32
+; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE42-NEXT: jne LBB23_25
+; SSE42-NEXT: LBB23_26: ## %else35
+; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
+; SSE42-NEXT: jne LBB23_27
+; SSE42-NEXT: LBB23_28: ## %else38
+; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE42-NEXT: jne LBB23_29
+; SSE42-NEXT: LBB23_30: ## %else41
+; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE42-NEXT: jne LBB23_31
+; SSE42-NEXT: LBB23_32: ## %else44
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: retq
+; SSE42-NEXT: LBB23_1: ## %cond.load
; SSE42-NEXT: pinsrb $0, (%rdi), %xmm1
-; SSE42-NEXT: LBB23_2: ## %else
-; SSE42-NEXT: pextrb $1, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB23_4
-; SSE42-NEXT: ## %bb.3: ## %cond.load1
+; SSE42-NEXT: LBB23_3: ## %cond.load1
; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm1
-; SSE42-NEXT: LBB23_4: ## %else2
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $2, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB23_6
-; SSE42-NEXT: ## %bb.5: ## %cond.load4
+; SSE42-NEXT: LBB23_5: ## %cond.load4
; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm1
-; SSE42-NEXT: LBB23_6: ## %else5
-; SSE42-NEXT: pextrb $3, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB23_8
-; SSE42-NEXT: ## %bb.7: ## %cond.load7
+; SSE42-NEXT: LBB23_7: ## %cond.load7
; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm1
-; SSE42-NEXT: LBB23_8: ## %else8
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $4, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $16, %al
; SSE42-NEXT: je LBB23_10
-; SSE42-NEXT: ## %bb.9: ## %cond.load10
+; SSE42-NEXT: LBB23_9: ## %cond.load10
; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm1
-; SSE42-NEXT: LBB23_10: ## %else11
-; SSE42-NEXT: pextrb $5, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $32, %al
; SSE42-NEXT: je LBB23_12
-; SSE42-NEXT: ## %bb.11: ## %cond.load13
+; SSE42-NEXT: LBB23_11: ## %cond.load13
; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm1
-; SSE42-NEXT: LBB23_12: ## %else14
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $6, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB23_14
-; SSE42-NEXT: ## %bb.13: ## %cond.load16
+; SSE42-NEXT: LBB23_13: ## %cond.load16
; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm1
-; SSE42-NEXT: LBB23_14: ## %else17
-; SSE42-NEXT: pextrb $7, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB23_16
-; SSE42-NEXT: ## %bb.15: ## %cond.load19
+; SSE42-NEXT: LBB23_15: ## %cond.load19
; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm1
-; SSE42-NEXT: LBB23_16: ## %else20
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $8, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $256, %eax ## imm = 0x100
; SSE42-NEXT: je LBB23_18
-; SSE42-NEXT: ## %bb.17: ## %cond.load22
+; SSE42-NEXT: LBB23_17: ## %cond.load22
; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm1
-; SSE42-NEXT: LBB23_18: ## %else23
-; SSE42-NEXT: pextrb $9, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $512, %eax ## imm = 0x200
; SSE42-NEXT: je LBB23_20
-; SSE42-NEXT: ## %bb.19: ## %cond.load25
+; SSE42-NEXT: LBB23_19: ## %cond.load25
; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm1
-; SSE42-NEXT: LBB23_20: ## %else26
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $10, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
; SSE42-NEXT: je LBB23_22
-; SSE42-NEXT: ## %bb.21: ## %cond.load28
+; SSE42-NEXT: LBB23_21: ## %cond.load28
; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm1
-; SSE42-NEXT: LBB23_22: ## %else29
-; SSE42-NEXT: pextrb $11, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
; SSE42-NEXT: je LBB23_24
-; SSE42-NEXT: ## %bb.23: ## %cond.load31
+; SSE42-NEXT: LBB23_23: ## %cond.load31
; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm1
-; SSE42-NEXT: LBB23_24: ## %else32
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $12, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE42-NEXT: je LBB23_26
-; SSE42-NEXT: ## %bb.25: ## %cond.load34
+; SSE42-NEXT: LBB23_25: ## %cond.load34
; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm1
-; SSE42-NEXT: LBB23_26: ## %else35
-; SSE42-NEXT: pextrb $13, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE42-NEXT: je LBB23_28
-; SSE42-NEXT: ## %bb.27: ## %cond.load37
+; SSE42-NEXT: LBB23_27: ## %cond.load37
; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm1
-; SSE42-NEXT: LBB23_28: ## %else38
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE42-NEXT: pextrb $14, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE42-NEXT: je LBB23_30
-; SSE42-NEXT: ## %bb.29: ## %cond.load40
+; SSE42-NEXT: LBB23_29: ## %cond.load40
; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm1
-; SSE42-NEXT: LBB23_30: ## %else41
-; SSE42-NEXT: pextrb $15, %xmm2, %eax
-; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE42-NEXT: je LBB23_32
-; SSE42-NEXT: ## %bb.31: ## %cond.load43
+; SSE42-NEXT: LBB23_31: ## %cond.load43
; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm1
-; SSE42-NEXT: LBB23_32: ## %else44
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: load_v16i8_v16i8:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
; AVX1OR2-NEXT: testb $1, %al
-; AVX1OR2-NEXT: je LBB23_2
-; AVX1OR2-NEXT: ## %bb.1: ## %cond.load
-; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_2: ## %else
-; AVX1OR2-NEXT: vpextrb $1, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
-; AVX1OR2-NEXT: je LBB23_4
-; AVX1OR2-NEXT: ## %bb.3: ## %cond.load1
-; AVX1OR2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
+; AVX1OR2-NEXT: jne LBB23_1
+; AVX1OR2-NEXT: ## %bb.2: ## %else
+; AVX1OR2-NEXT: testb $2, %al
+; AVX1OR2-NEXT: jne LBB23_3
; AVX1OR2-NEXT: LBB23_4: ## %else2
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
-; AVX1OR2-NEXT: je LBB23_6
-; AVX1OR2-NEXT: ## %bb.5: ## %cond.load4
-; AVX1OR2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
+; AVX1OR2-NEXT: testb $4, %al
+; AVX1OR2-NEXT: jne LBB23_5
; AVX1OR2-NEXT: LBB23_6: ## %else5
-; AVX1OR2-NEXT: vpextrb $3, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $8, %al
+; AVX1OR2-NEXT: jne LBB23_7
+; AVX1OR2-NEXT: LBB23_8: ## %else8
+; AVX1OR2-NEXT: testb $16, %al
+; AVX1OR2-NEXT: jne LBB23_9
+; AVX1OR2-NEXT: LBB23_10: ## %else11
+; AVX1OR2-NEXT: testb $32, %al
+; AVX1OR2-NEXT: jne LBB23_11
+; AVX1OR2-NEXT: LBB23_12: ## %else14
+; AVX1OR2-NEXT: testb $64, %al
+; AVX1OR2-NEXT: jne LBB23_13
+; AVX1OR2-NEXT: LBB23_14: ## %else17
+; AVX1OR2-NEXT: testb $-128, %al
+; AVX1OR2-NEXT: jne LBB23_15
+; AVX1OR2-NEXT: LBB23_16: ## %else20
+; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
+; AVX1OR2-NEXT: jne LBB23_17
+; AVX1OR2-NEXT: LBB23_18: ## %else23
+; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
+; AVX1OR2-NEXT: jne LBB23_19
+; AVX1OR2-NEXT: LBB23_20: ## %else26
+; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX1OR2-NEXT: jne LBB23_21
+; AVX1OR2-NEXT: LBB23_22: ## %else29
+; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX1OR2-NEXT: jne LBB23_23
+; AVX1OR2-NEXT: LBB23_24: ## %else32
+; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX1OR2-NEXT: jne LBB23_25
+; AVX1OR2-NEXT: LBB23_26: ## %else35
+; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX1OR2-NEXT: jne LBB23_27
+; AVX1OR2-NEXT: LBB23_28: ## %else38
+; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX1OR2-NEXT: jne LBB23_29
+; AVX1OR2-NEXT: LBB23_30: ## %else41
+; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX1OR2-NEXT: jne LBB23_31
+; AVX1OR2-NEXT: LBB23_32: ## %else44
+; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX1OR2-NEXT: retq
+; AVX1OR2-NEXT: LBB23_1: ## %cond.load
+; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
+; AVX1OR2-NEXT: testb $2, %al
+; AVX1OR2-NEXT: je LBB23_4
+; AVX1OR2-NEXT: LBB23_3: ## %cond.load1
+; AVX1OR2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
+; AVX1OR2-NEXT: testb $4, %al
+; AVX1OR2-NEXT: je LBB23_6
+; AVX1OR2-NEXT: LBB23_5: ## %cond.load4
+; AVX1OR2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
+; AVX1OR2-NEXT: testb $8, %al
; AVX1OR2-NEXT: je LBB23_8
-; AVX1OR2-NEXT: ## %bb.7: ## %cond.load7
+; AVX1OR2-NEXT: LBB23_7: ## %cond.load7
; AVX1OR2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_8: ## %else8
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $16, %al
; AVX1OR2-NEXT: je LBB23_10
-; AVX1OR2-NEXT: ## %bb.9: ## %cond.load10
+; AVX1OR2-NEXT: LBB23_9: ## %cond.load10
; AVX1OR2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_10: ## %else11
-; AVX1OR2-NEXT: vpextrb $5, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $32, %al
; AVX1OR2-NEXT: je LBB23_12
-; AVX1OR2-NEXT: ## %bb.11: ## %cond.load13
+; AVX1OR2-NEXT: LBB23_11: ## %cond.load13
; AVX1OR2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_12: ## %else14
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpextrb $6, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $64, %al
; AVX1OR2-NEXT: je LBB23_14
-; AVX1OR2-NEXT: ## %bb.13: ## %cond.load16
+; AVX1OR2-NEXT: LBB23_13: ## %cond.load16
; AVX1OR2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_14: ## %else17
-; AVX1OR2-NEXT: vpextrb $7, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $-128, %al
; AVX1OR2-NEXT: je LBB23_16
-; AVX1OR2-NEXT: ## %bb.15: ## %cond.load19
+; AVX1OR2-NEXT: LBB23_15: ## %cond.load19
; AVX1OR2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_16: ## %else20
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpextrb $8, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
; AVX1OR2-NEXT: je LBB23_18
-; AVX1OR2-NEXT: ## %bb.17: ## %cond.load22
+; AVX1OR2-NEXT: LBB23_17: ## %cond.load22
; AVX1OR2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_18: ## %else23
-; AVX1OR2-NEXT: vpextrb $9, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
; AVX1OR2-NEXT: je LBB23_20
-; AVX1OR2-NEXT: ## %bb.19: ## %cond.load25
+; AVX1OR2-NEXT: LBB23_19: ## %cond.load25
; AVX1OR2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_20: ## %else26
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1OR2-NEXT: je LBB23_22
-; AVX1OR2-NEXT: ## %bb.21: ## %cond.load28
+; AVX1OR2-NEXT: LBB23_21: ## %cond.load28
; AVX1OR2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_22: ## %else29
-; AVX1OR2-NEXT: vpextrb $11, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1OR2-NEXT: je LBB23_24
-; AVX1OR2-NEXT: ## %bb.23: ## %cond.load31
+; AVX1OR2-NEXT: LBB23_23: ## %cond.load31
; AVX1OR2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_24: ## %else32
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpextrb $12, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1OR2-NEXT: je LBB23_26
-; AVX1OR2-NEXT: ## %bb.25: ## %cond.load34
+; AVX1OR2-NEXT: LBB23_25: ## %cond.load34
; AVX1OR2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_26: ## %else35
-; AVX1OR2-NEXT: vpextrb $13, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1OR2-NEXT: je LBB23_28
-; AVX1OR2-NEXT: ## %bb.27: ## %cond.load37
+; AVX1OR2-NEXT: LBB23_27: ## %cond.load37
; AVX1OR2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_28: ## %else38
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX1OR2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1OR2-NEXT: je LBB23_30
-; AVX1OR2-NEXT: ## %bb.29: ## %cond.load40
+; AVX1OR2-NEXT: LBB23_29: ## %cond.load40
; AVX1OR2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_30: ## %else41
-; AVX1OR2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1OR2-NEXT: je LBB23_32
-; AVX1OR2-NEXT: ## %bb.31: ## %cond.load43
+; AVX1OR2-NEXT: LBB23_31: ## %cond.load43
; AVX1OR2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
-; AVX1OR2-NEXT: LBB23_32: ## %else44
; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v16i8_v16i8:
; AVX512F: ## %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpmovmskb %xmm0, %eax
; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je LBB23_2
-; AVX512F-NEXT: ## %bb.1: ## %cond.load
+; AVX512F-NEXT: jne LBB23_1
+; AVX512F-NEXT: ## %bb.2: ## %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne LBB23_3
+; AVX512F-NEXT: LBB23_4: ## %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: jne LBB23_5
+; AVX512F-NEXT: LBB23_6: ## %else5
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne LBB23_7
+; AVX512F-NEXT: LBB23_8: ## %else8
+; AVX512F-NEXT: testb $16, %al
+; AVX512F-NEXT: jne LBB23_9
+; AVX512F-NEXT: LBB23_10: ## %else11
+; AVX512F-NEXT: testb $32, %al
+; AVX512F-NEXT: jne LBB23_11
+; AVX512F-NEXT: LBB23_12: ## %else14
+; AVX512F-NEXT: testb $64, %al
+; AVX512F-NEXT: jne LBB23_13
+; AVX512F-NEXT: LBB23_14: ## %else17
+; AVX512F-NEXT: testb $-128, %al
+; AVX512F-NEXT: jne LBB23_15
+; AVX512F-NEXT: LBB23_16: ## %else20
+; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512F-NEXT: jne LBB23_17
+; AVX512F-NEXT: LBB23_18: ## %else23
+; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512F-NEXT: jne LBB23_19
+; AVX512F-NEXT: LBB23_20: ## %else26
+; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512F-NEXT: jne LBB23_21
+; AVX512F-NEXT: LBB23_22: ## %else29
+; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512F-NEXT: jne LBB23_23
+; AVX512F-NEXT: LBB23_24: ## %else32
+; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512F-NEXT: jne LBB23_25
+; AVX512F-NEXT: LBB23_26: ## %else35
+; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512F-NEXT: jne LBB23_27
+; AVX512F-NEXT: LBB23_28: ## %else38
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512F-NEXT: jne LBB23_29
+; AVX512F-NEXT: LBB23_30: ## %else41
+; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512F-NEXT: jne LBB23_31
+; AVX512F-NEXT: LBB23_32: ## %else44
+; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: LBB23_1: ## %cond.load
; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_2: ## %else
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je LBB23_4
-; AVX512F-NEXT: ## %bb.3: ## %cond.load1
+; AVX512F-NEXT: LBB23_3: ## %cond.load1
; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_4: ## %else2
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je LBB23_6
-; AVX512F-NEXT: ## %bb.5: ## %cond.load4
+; AVX512F-NEXT: LBB23_5: ## %cond.load4
; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_6: ## %else5
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB23_8
-; AVX512F-NEXT: ## %bb.7: ## %cond.load7
+; AVX512F-NEXT: LBB23_7: ## %cond.load7
; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_8: ## %else8
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB23_10
-; AVX512F-NEXT: ## %bb.9: ## %cond.load10
+; AVX512F-NEXT: LBB23_9: ## %cond.load10
; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_10: ## %else11
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB23_12
-; AVX512F-NEXT: ## %bb.11: ## %cond.load13
+; AVX512F-NEXT: LBB23_11: ## %cond.load13
; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_12: ## %else14
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB23_14
-; AVX512F-NEXT: ## %bb.13: ## %cond.load16
+; AVX512F-NEXT: LBB23_13: ## %cond.load16
; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_14: ## %else17
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je LBB23_16
-; AVX512F-NEXT: ## %bb.15: ## %cond.load19
+; AVX512F-NEXT: LBB23_15: ## %cond.load19
; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_16: ## %else20
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: je LBB23_18
-; AVX512F-NEXT: ## %bb.17: ## %cond.load22
+; AVX512F-NEXT: LBB23_17: ## %cond.load22
; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_18: ## %else23
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB23_20
-; AVX512F-NEXT: ## %bb.19: ## %cond.load25
+; AVX512F-NEXT: LBB23_19: ## %cond.load25
; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_20: ## %else26
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $10, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB23_22
-; AVX512F-NEXT: ## %bb.21: ## %cond.load28
+; AVX512F-NEXT: LBB23_21: ## %cond.load28
; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_22: ## %else29
-; AVX512F-NEXT: kshiftrw $11, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB23_24
-; AVX512F-NEXT: ## %bb.23: ## %cond.load31
+; AVX512F-NEXT: LBB23_23: ## %cond.load31
; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_24: ## %else32
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB23_26
-; AVX512F-NEXT: ## %bb.25: ## %cond.load34
+; AVX512F-NEXT: LBB23_25: ## %cond.load34
; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_26: ## %else35
-; AVX512F-NEXT: kshiftrw $13, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB23_28
-; AVX512F-NEXT: ## %bb.27: ## %cond.load37
+; AVX512F-NEXT: LBB23_27: ## %cond.load37
; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_28: ## %else38
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB23_30
-; AVX512F-NEXT: ## %bb.29: ## %cond.load40
+; AVX512F-NEXT: LBB23_29: ## %cond.load40
; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_30: ## %else41
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: je LBB23_32
-; AVX512F-NEXT: ## %bb.31: ## %cond.load43
+; AVX512F-NEXT: LBB23_31: ## %cond.load43
; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: LBB23_32: ## %else44
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_v16i8_v16i8:
; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
+; AVX512VLDQ-NEXT: vpmovmskb %xmm0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB23_2
-; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load
+; AVX512VLDQ-NEXT: jne LBB23_1
+; AVX512VLDQ-NEXT: ## %bb.2: ## %else
+; AVX512VLDQ-NEXT: testb $2, %al
+; AVX512VLDQ-NEXT: jne LBB23_3
+; AVX512VLDQ-NEXT: LBB23_4: ## %else2
+; AVX512VLDQ-NEXT: testb $4, %al
+; AVX512VLDQ-NEXT: jne LBB23_5
+; AVX512VLDQ-NEXT: LBB23_6: ## %else5
+; AVX512VLDQ-NEXT: testb $8, %al
+; AVX512VLDQ-NEXT: jne LBB23_7
+; AVX512VLDQ-NEXT: LBB23_8: ## %else8
+; AVX512VLDQ-NEXT: testb $16, %al
+; AVX512VLDQ-NEXT: jne LBB23_9
+; AVX512VLDQ-NEXT: LBB23_10: ## %else11
+; AVX512VLDQ-NEXT: testb $32, %al
+; AVX512VLDQ-NEXT: jne LBB23_11
+; AVX512VLDQ-NEXT: LBB23_12: ## %else14
+; AVX512VLDQ-NEXT: testb $64, %al
+; AVX512VLDQ-NEXT: jne LBB23_13
+; AVX512VLDQ-NEXT: LBB23_14: ## %else17
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: jne LBB23_15
+; AVX512VLDQ-NEXT: LBB23_16: ## %else20
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512VLDQ-NEXT: jne LBB23_17
+; AVX512VLDQ-NEXT: LBB23_18: ## %else23
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512VLDQ-NEXT: jne LBB23_19
+; AVX512VLDQ-NEXT: LBB23_20: ## %else26
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512VLDQ-NEXT: jne LBB23_21
+; AVX512VLDQ-NEXT: LBB23_22: ## %else29
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512VLDQ-NEXT: jne LBB23_23
+; AVX512VLDQ-NEXT: LBB23_24: ## %else32
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512VLDQ-NEXT: jne LBB23_25
+; AVX512VLDQ-NEXT: LBB23_26: ## %else35
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512VLDQ-NEXT: jne LBB23_27
+; AVX512VLDQ-NEXT: LBB23_28: ## %else38
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512VLDQ-NEXT: jne LBB23_29
+; AVX512VLDQ-NEXT: LBB23_30: ## %else41
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: jne LBB23_31
+; AVX512VLDQ-NEXT: LBB23_32: ## %else44
+; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VLDQ-NEXT: retq
+; AVX512VLDQ-NEXT: LBB23_1: ## %cond.load
; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_2: ## %else
-; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB23_4
-; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1
+; AVX512VLDQ-NEXT: LBB23_3: ## %cond.load1
; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_4: ## %else2
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB23_6
-; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4
+; AVX512VLDQ-NEXT: LBB23_5: ## %cond.load4
; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_6: ## %else5
-; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB23_8
-; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7
+; AVX512VLDQ-NEXT: LBB23_7: ## %cond.load7
; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_8: ## %else8
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB23_10
-; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10
+; AVX512VLDQ-NEXT: LBB23_9: ## %cond.load10
; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_10: ## %else11
-; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB23_12
-; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13
+; AVX512VLDQ-NEXT: LBB23_11: ## %cond.load13
; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_12: ## %else14
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB23_14
-; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16
+; AVX512VLDQ-NEXT: LBB23_13: ## %cond.load16
; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_14: ## %else17
-; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: je LBB23_16
-; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19
+; AVX512VLDQ-NEXT: LBB23_15: ## %cond.load19
; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_16: ## %else20
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: je LBB23_18
-; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load22
+; AVX512VLDQ-NEXT: LBB23_17: ## %cond.load22
; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_18: ## %else23
-; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB23_20
-; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load25
+; AVX512VLDQ-NEXT: LBB23_19: ## %cond.load25
; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_20: ## %else26
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB23_22
-; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load28
+; AVX512VLDQ-NEXT: LBB23_21: ## %cond.load28
; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_22: ## %else29
-; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB23_24
-; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load31
+; AVX512VLDQ-NEXT: LBB23_23: ## %cond.load31
; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_24: ## %else32
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB23_26
-; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load34
+; AVX512VLDQ-NEXT: LBB23_25: ## %cond.load34
; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_26: ## %else35
-; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB23_28
-; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load37
+; AVX512VLDQ-NEXT: LBB23_27: ## %cond.load37
; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_28: ## %else38
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB23_30
-; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load40
+; AVX512VLDQ-NEXT: LBB23_29: ## %cond.load40
; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_30: ## %else41
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB23_32
-; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load43
+; AVX512VLDQ-NEXT: LBB23_31: ## %cond.load43
; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
-; AVX512VLDQ-NEXT: LBB23_32: ## %else44
; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_v16i8_v16i8:
@@ -4639,402 +4361,412 @@
define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i8> %dst) {
; SSE2-LABEL: load_v32i8_v32i8:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: movd %xmm4, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_2
-; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl (%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: LBB24_2: ## %else
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_4
-; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 1(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: psllw $8, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
+; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: orl %ecx, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: jne LBB24_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB24_3
; SSE2-NEXT: LBB24_4: ## %else2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: movd %xmm4, %eax
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: je LBB24_6
-; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 2(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB24_5
; SSE2-NEXT: LBB24_6: ## %else5
-; SSE2-NEXT: shrl $24, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_8
-; SSE2-NEXT: ## %bb.7: ## %cond.load7
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 3(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: pslld $24, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB24_7
; SSE2-NEXT: LBB24_8: ## %else8
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $2, %xmm4, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_10
-; SSE2-NEXT: ## %bb.9: ## %cond.load10
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 4(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: jne LBB24_9
; SSE2-NEXT: LBB24_10: ## %else11
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_12
-; SSE2-NEXT: ## %bb.11: ## %cond.load13
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 5(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: psllq $40, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: jne LBB24_11
; SSE2-NEXT: LBB24_12: ## %else14
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $3, %xmm4, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_14
-; SSE2-NEXT: ## %bb.13: ## %cond.load16
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 6(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: psllq $48, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: jne LBB24_13
; SSE2-NEXT: LBB24_14: ## %else17
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_16
-; SSE2-NEXT: ## %bb.15: ## %cond.load19
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 7(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: psllq $56, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testb $-128, %al
+; SSE2-NEXT: jne LBB24_15
; SSE2-NEXT: LBB24_16: ## %else20
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $4, %xmm4, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_18
-; SSE2-NEXT: ## %bb.17: ## %cond.load22
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 8(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testl $256, %eax ## imm = 0x100
+; SSE2-NEXT: jne LBB24_17
; SSE2-NEXT: LBB24_18: ## %else23
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_20
-; SSE2-NEXT: ## %bb.19: ## %cond.load25
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 9(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6]
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testl $512, %eax ## imm = 0x200
+; SSE2-NEXT: jne LBB24_19
; SSE2-NEXT: LBB24_20: ## %else26
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $5, %xmm4, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_22
-; SSE2-NEXT: ## %bb.21: ## %cond.load28
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 10(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE2-NEXT: jne LBB24_21
; SSE2-NEXT: LBB24_22: ## %else29
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_24
-; SSE2-NEXT: ## %bb.23: ## %cond.load31
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 11(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
+; SSE2-NEXT: jne LBB24_23
; SSE2-NEXT: LBB24_24: ## %else32
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $6, %xmm4, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_26
-; SSE2-NEXT: ## %bb.25: ## %cond.load34
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 12(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE2-NEXT: jne LBB24_25
; SSE2-NEXT: LBB24_26: ## %else35
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_28
-; SSE2-NEXT: ## %bb.27: ## %cond.load37
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: movzbl 13(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
+; SSE2-NEXT: jne LBB24_27
; SSE2-NEXT: LBB24_28: ## %else38
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $7, %xmm4, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_30
-; SSE2-NEXT: ## %bb.29: ## %cond.load40
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: movzbl 14(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE2-NEXT: jne LBB24_29
; SSE2-NEXT: LBB24_30: ## %else41
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_32
-; SSE2-NEXT: ## %bb.31: ## %cond.load43
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movzbl 15(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE2-NEXT: jne LBB24_31
; SSE2-NEXT: LBB24_32: ## %else44
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_34
-; SSE2-NEXT: ## %bb.33: ## %cond.load46
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: movzbl 16(%rdi), %ecx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
+; SSE2-NEXT: jne LBB24_33
; SSE2-NEXT: LBB24_34: ## %else47
-; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_36
-; SSE2-NEXT: ## %bb.35: ## %cond.load49
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: movzbl 17(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: psllw $8, %xmm4
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
+; SSE2-NEXT: jne LBB24_35
; SSE2-NEXT: LBB24_36: ## %else50
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: je LBB24_38
-; SSE2-NEXT: ## %bb.37: ## %cond.load52
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
+; SSE2-NEXT: jne LBB24_37
+; SSE2-NEXT: LBB24_38: ## %else53
+; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
+; SSE2-NEXT: jne LBB24_39
+; SSE2-NEXT: LBB24_40: ## %else56
+; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
+; SSE2-NEXT: jne LBB24_41
+; SSE2-NEXT: LBB24_42: ## %else59
+; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
+; SSE2-NEXT: jne LBB24_43
+; SSE2-NEXT: LBB24_44: ## %else62
+; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
+; SSE2-NEXT: jne LBB24_45
+; SSE2-NEXT: LBB24_46: ## %else65
+; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
+; SSE2-NEXT: jne LBB24_47
+; SSE2-NEXT: LBB24_48: ## %else68
+; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
+; SSE2-NEXT: jne LBB24_49
+; SSE2-NEXT: LBB24_50: ## %else71
+; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000
+; SSE2-NEXT: jne LBB24_51
+; SSE2-NEXT: LBB24_52: ## %else74
+; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
+; SSE2-NEXT: jne LBB24_53
+; SSE2-NEXT: LBB24_54: ## %else77
+; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
+; SSE2-NEXT: jne LBB24_55
+; SSE2-NEXT: LBB24_56: ## %else80
+; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000
+; SSE2-NEXT: jne LBB24_57
+; SSE2-NEXT: LBB24_58: ## %else83
+; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000
+; SSE2-NEXT: jne LBB24_59
+; SSE2-NEXT: LBB24_60: ## %else86
+; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
+; SSE2-NEXT: jne LBB24_61
+; SSE2-NEXT: LBB24_62: ## %else89
+; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
+; SSE2-NEXT: je LBB24_64
+; SSE2-NEXT: LBB24_63: ## %cond.load91
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: movzbl 31(%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: LBB24_64: ## %else92
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB24_1: ## %cond.load
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 =
[0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl (%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB24_4 +; SSE2-NEXT: LBB24_3: ## %cond.load1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 1(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB24_6 +; SSE2-NEXT: LBB24_5: ## %cond.load4 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 2(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je LBB24_8 +; SSE2-NEXT: LBB24_7: ## %cond.load7 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 3(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslld $24, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je LBB24_10 +; SSE2-NEXT: LBB24_9: ## %cond.load10 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 4(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je LBB24_12 +; SSE2-NEXT: LBB24_11: ## %cond.load13 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 5(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: psllq $40, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je LBB24_14 +; SSE2-NEXT: LBB24_13: ## %cond.load16 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 6(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: psllq $48, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: je LBB24_16 +; SSE2-NEXT: LBB24_15: ## %cond.load19 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 7(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: psllq $56, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $256, %eax ## imm = 0x100 +; SSE2-NEXT: je LBB24_18 +; SSE2-NEXT: LBB24_17: ## %cond.load22 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 8(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $512, %eax ## imm = 0x200 +; SSE2-NEXT: je LBB24_20 +; SSE2-NEXT: LBB24_19: ## %cond.load25 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 9(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE2-NEXT: je LBB24_22 +; SSE2-NEXT: LBB24_21: ## %cond.load28 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 10(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE2-NEXT: je LBB24_24 +; SSE2-NEXT: LBB24_23: ## %cond.load31 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 11(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE2-NEXT: je LBB24_26 +; SSE2-NEXT: LBB24_25: ## %cond.load34 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 12(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 +; SSE2-NEXT: je LBB24_28 +; SSE2-NEXT: LBB24_27: ## %cond.load37 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 13(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 +; SSE2-NEXT: je LBB24_30 +; SSE2-NEXT: LBB24_29: ## %cond.load40 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movzbl 14(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 +; SSE2-NEXT: je LBB24_32 +; SSE2-NEXT: LBB24_31: ## %cond.load43 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: movzbl 15(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000 +; SSE2-NEXT: je LBB24_34 +; SSE2-NEXT: LBB24_33: ## %cond.load46 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movzbl 16(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000 +; SSE2-NEXT: je LBB24_36 +; SSE2-NEXT: 
LBB24_35: ## %cond.load49 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movzbl 17(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000 +; SSE2-NEXT: je LBB24_38 +; SSE2-NEXT: LBB24_37: ## %cond.load52 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: movzbl 18(%rdi), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_38: ## %else53 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000 ; SSE2-NEXT: je LBB24_40 -; SSE2-NEXT: ## %bb.39: ## %cond.load55 +; SSE2-NEXT: LBB24_39: ## %cond.load55 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movzbl 19(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pslld $24, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movzbl 19(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslld $24, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_40: ## %else56 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; SSE2-NEXT: je LBB24_42 -; SSE2-NEXT: ## %bb.41: ## %cond.load58 +; SSE2-NEXT: LBB24_41: ## %cond.load58 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: movzbl 20(%rdi), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_42: ## %else59 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000 ; SSE2-NEXT: je LBB24_44 -; SSE2-NEXT: ## %bb.43: ## %cond.load61 +; SSE2-NEXT: LBB24_43: ## %cond.load61 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movzbl 21(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: psllq $40, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movzbl 21(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: psllq $40, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_44: ## %else62 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; SSE2-NEXT: je LBB24_46 -; SSE2-NEXT: ## %bb.45: ## %cond.load64 +; SSE2-NEXT: LBB24_45: ## %cond.load64 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: movzbl 22(%rdi), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: psllq $48, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: 
psllq $48, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_46: ## %else65 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000 ; SSE2-NEXT: je LBB24_48 -; SSE2-NEXT: ## %bb.47: ## %cond.load67 +; SSE2-NEXT: LBB24_47: ## %cond.load67 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movzbl 23(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: psllq $56, %xmm4 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movzbl 23(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: psllq $56, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_48: ## %else68 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; SSE2-NEXT: je LBB24_50 -; SSE2-NEXT: ## %bb.49: ## %cond.load70 +; SSE2-NEXT: LBB24_49: ## %cond.load70 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: movzbl 24(%rdi), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_50: ## %else71 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; SSE2-NEXT: je LBB24_52 -; SSE2-NEXT: ## %bb.51: ## %cond.load73 +; SSE2-NEXT: LBB24_51: ## %cond.load73 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movzbl 25(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6] -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movzbl 25(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6] +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_52: ## %else74 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; SSE2-NEXT: je LBB24_54 -; SSE2-NEXT: ## %bb.53: ## %cond.load76 +; SSE2-NEXT: LBB24_53: ## %cond.load76 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: movzbl 26(%rdi), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_54: ## %else77 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; SSE2-NEXT: je LBB24_56 -; SSE2-NEXT: ## %bb.55: ## %cond.load79 +; SSE2-NEXT: LBB24_55: ## %cond.load79 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] ; SSE2-NEXT: 
pand %xmm0, %xmm3 -; SSE2-NEXT: movzbl 27(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movzbl 27(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_56: ## %else80 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; SSE2-NEXT: je LBB24_58 -; SSE2-NEXT: ## %bb.57: ## %cond.load82 +; SSE2-NEXT: LBB24_57: ## %cond.load82 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: movzbl 28(%rdi), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_58: ## %else83 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; SSE2-NEXT: je LBB24_60 -; SSE2-NEXT: ## %bb.59: ## %cond.load85 +; SSE2-NEXT: LBB24_59: ## %cond.load85 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movzbl 29(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: movzbl 29(%rdi), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2] +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_60: ## %else86 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; SSE2-NEXT: je LBB24_62 -; SSE2-NEXT: ## %bb.61: ## %cond.load88 +; SSE2-NEXT: LBB24_61: ## %cond.load88 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: movzbl 30(%rdi), %ecx @@ -5042,1548 +4774,1356 @@ ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_62: ## %else89 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB24_64 -; SSE2-NEXT: ## %bb.63: ## %cond.load91 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE2-NEXT: movzbl 31(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: LBB24_64: ## %else92 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: retq +; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; SSE2-NEXT: jne LBB24_63 +; SSE2-NEXT: jmp LBB24_64 ; ; SSE42-LABEL: load_v32i8_v32i8: ; SSE42: ## %bb.0: -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: 
pextrb $0, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_2 -; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: pinsrb $0, (%rdi), %xmm2 -; SSE42-NEXT: LBB24_2: ## %else -; SSE42-NEXT: pextrb $1, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_4 -; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm2 +; SSE42-NEXT: pmovmskb %xmm0, %ecx +; SSE42-NEXT: pmovmskb %xmm1, %eax +; SSE42-NEXT: shll $16, %eax +; SSE42-NEXT: orl %ecx, %eax +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: jne LBB24_1 +; SSE42-NEXT: ## %bb.2: ## %else +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: jne LBB24_3 ; SSE42-NEXT: LBB24_4: ## %else2 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: pextrb $2, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_6 -; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm2 +; SSE42-NEXT: testb $4, %al +; SSE42-NEXT: jne LBB24_5 ; SSE42-NEXT: LBB24_6: ## %else5 -; SSE42-NEXT: pextrb $3, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_8 -; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm2 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: jne LBB24_7 ; SSE42-NEXT: LBB24_8: ## %else8 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: pextrb $4, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_10 -; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm2 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: jne LBB24_9 ; SSE42-NEXT: LBB24_10: ## %else11 -; SSE42-NEXT: pextrb $5, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_12 -; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm2 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: jne LBB24_11 ; SSE42-NEXT: LBB24_12: ## %else14 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: pextrb $6, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_14 -; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm2 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: jne LBB24_13 ; SSE42-NEXT: LBB24_14: ## %else17 -; SSE42-NEXT: pextrb $7, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_16 -; SSE42-NEXT: ## %bb.15: ## %cond.load19 -; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm2 +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: jne LBB24_15 ; SSE42-NEXT: LBB24_16: ## %else20 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: pextrb $8, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_18 -; SSE42-NEXT: ## %bb.17: ## %cond.load22 -; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm2 +; SSE42-NEXT: testl $256, %eax ## imm = 0x100 +; SSE42-NEXT: jne LBB24_17 ; SSE42-NEXT: LBB24_18: ## %else23 -; SSE42-NEXT: pextrb $9, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_20 -; SSE42-NEXT: ## %bb.19: ## %cond.load25 -; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm2 +; SSE42-NEXT: testl $512, %eax ## imm = 0x200 +; SSE42-NEXT: jne LBB24_19 ; SSE42-NEXT: LBB24_20: ## %else26 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: pextrb $10, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_22 -; SSE42-NEXT: ## %bb.21: ## %cond.load28 -; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm2 +; SSE42-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE42-NEXT: jne LBB24_21 ; SSE42-NEXT: LBB24_22: ## %else29 -; SSE42-NEXT: pextrb $11, %xmm4, %eax -; SSE42-NEXT: testb $1, 
%al -; SSE42-NEXT: je LBB24_24 -; SSE42-NEXT: ## %bb.23: ## %cond.load31 -; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm2 +; SSE42-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE42-NEXT: jne LBB24_23 ; SSE42-NEXT: LBB24_24: ## %else32 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: pextrb $12, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_26 -; SSE42-NEXT: ## %bb.25: ## %cond.load34 -; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm2 +; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE42-NEXT: jne LBB24_25 ; SSE42-NEXT: LBB24_26: ## %else35 -; SSE42-NEXT: pextrb $13, %xmm4, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_28 -; SSE42-NEXT: ## %bb.27: ## %cond.load37 -; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm2 +; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000 +; SSE42-NEXT: jne LBB24_27 ; SSE42-NEXT: LBB24_28: ## %else38 -; SSE42-NEXT: pxor %xmm4, %xmm4 -; SSE42-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE42-NEXT: pextrb $14, %xmm4, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000 +; SSE42-NEXT: jne LBB24_29 +; SSE42-NEXT: LBB24_30: ## %else41 +; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000 +; SSE42-NEXT: jne LBB24_31 +; SSE42-NEXT: LBB24_32: ## %else44 +; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000 +; SSE42-NEXT: jne LBB24_33 +; SSE42-NEXT: LBB24_34: ## %else47 +; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000 +; SSE42-NEXT: jne LBB24_35 +; SSE42-NEXT: LBB24_36: ## %else50 +; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000 +; SSE42-NEXT: jne LBB24_37 +; SSE42-NEXT: LBB24_38: ## %else53 +; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000 +; SSE42-NEXT: jne LBB24_39 +; SSE42-NEXT: LBB24_40: ## %else56 +; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000 +; SSE42-NEXT: jne LBB24_41 +; SSE42-NEXT: LBB24_42: ## %else59 +; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000 +; SSE42-NEXT: jne LBB24_43 +; SSE42-NEXT: LBB24_44: ## %else62 +; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000 +; SSE42-NEXT: jne LBB24_45 +; SSE42-NEXT: LBB24_46: ## %else65 +; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000 +; SSE42-NEXT: jne LBB24_47 +; SSE42-NEXT: LBB24_48: ## %else68 +; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; SSE42-NEXT: jne LBB24_49 +; SSE42-NEXT: LBB24_50: ## %else71 +; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; SSE42-NEXT: jne LBB24_51 +; SSE42-NEXT: LBB24_52: ## %else74 +; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; SSE42-NEXT: jne LBB24_53 +; SSE42-NEXT: LBB24_54: ## %else77 +; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; SSE42-NEXT: jne LBB24_55 +; SSE42-NEXT: LBB24_56: ## %else80 +; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; SSE42-NEXT: jne LBB24_57 +; SSE42-NEXT: LBB24_58: ## %else83 +; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; SSE42-NEXT: jne LBB24_59 +; SSE42-NEXT: LBB24_60: ## %else86 +; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; SSE42-NEXT: jne LBB24_61 +; SSE42-NEXT: LBB24_62: ## %else89 +; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; SSE42-NEXT: je LBB24_64 +; SSE42-NEXT: LBB24_63: ## %cond.load91 +; SSE42-NEXT: pinsrb $15, 31(%rdi), %xmm3 +; SSE42-NEXT: LBB24_64: ## %else92 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: movdqa %xmm3, %xmm1 +; SSE42-NEXT: retq +; SSE42-NEXT: LBB24_1: ## %cond.load +; SSE42-NEXT: pinsrb $0, (%rdi), %xmm2 +; SSE42-NEXT: testb $2, %al +; SSE42-NEXT: je LBB24_4 +; SSE42-NEXT: LBB24_3: ## %cond.load1 +; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm2 +; SSE42-NEXT: 
testb $4, %al +; SSE42-NEXT: je LBB24_6 +; SSE42-NEXT: LBB24_5: ## %cond.load4 +; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm2 +; SSE42-NEXT: testb $8, %al +; SSE42-NEXT: je LBB24_8 +; SSE42-NEXT: LBB24_7: ## %cond.load7 +; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm2 +; SSE42-NEXT: testb $16, %al +; SSE42-NEXT: je LBB24_10 +; SSE42-NEXT: LBB24_9: ## %cond.load10 +; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm2 +; SSE42-NEXT: testb $32, %al +; SSE42-NEXT: je LBB24_12 +; SSE42-NEXT: LBB24_11: ## %cond.load13 +; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm2 +; SSE42-NEXT: testb $64, %al +; SSE42-NEXT: je LBB24_14 +; SSE42-NEXT: LBB24_13: ## %cond.load16 +; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm2 +; SSE42-NEXT: testb $-128, %al +; SSE42-NEXT: je LBB24_16 +; SSE42-NEXT: LBB24_15: ## %cond.load19 +; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm2 +; SSE42-NEXT: testl $256, %eax ## imm = 0x100 +; SSE42-NEXT: je LBB24_18 +; SSE42-NEXT: LBB24_17: ## %cond.load22 +; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm2 +; SSE42-NEXT: testl $512, %eax ## imm = 0x200 +; SSE42-NEXT: je LBB24_20 +; SSE42-NEXT: LBB24_19: ## %cond.load25 +; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm2 +; SSE42-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE42-NEXT: je LBB24_22 +; SSE42-NEXT: LBB24_21: ## %cond.load28 +; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm2 +; SSE42-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE42-NEXT: je LBB24_24 +; SSE42-NEXT: LBB24_23: ## %cond.load31 +; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm2 +; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE42-NEXT: je LBB24_26 +; SSE42-NEXT: LBB24_25: ## %cond.load34 +; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm2 +; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000 +; SSE42-NEXT: je LBB24_28 +; SSE42-NEXT: LBB24_27: ## %cond.load37 +; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm2 +; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE42-NEXT: je LBB24_30 -; SSE42-NEXT: ## %bb.29: ## %cond.load40 +; SSE42-NEXT: LBB24_29: ## %cond.load40 ; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm2 -; SSE42-NEXT: LBB24_30: ## %else41 -; SSE42-NEXT: pextrb $15, %xmm4, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000 ; SSE42-NEXT: je LBB24_32 -; SSE42-NEXT: ## %bb.31: ## %cond.load43 +; SSE42-NEXT: LBB24_31: ## %cond.load43 ; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm2 -; SSE42-NEXT: LBB24_32: ## %else44 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $0, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000 ; SSE42-NEXT: je LBB24_34 -; SSE42-NEXT: ## %bb.33: ## %cond.load46 +; SSE42-NEXT: LBB24_33: ## %cond.load46 ; SSE42-NEXT: pinsrb $0, 16(%rdi), %xmm3 -; SSE42-NEXT: LBB24_34: ## %else47 -; SSE42-NEXT: pextrb $1, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000 ; SSE42-NEXT: je LBB24_36 -; SSE42-NEXT: ## %bb.35: ## %cond.load49 +; SSE42-NEXT: LBB24_35: ## %cond.load49 ; SSE42-NEXT: pinsrb $1, 17(%rdi), %xmm3 -; SSE42-NEXT: LBB24_36: ## %else50 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $2, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000 ; SSE42-NEXT: je LBB24_38 -; SSE42-NEXT: ## %bb.37: ## %cond.load52 +; SSE42-NEXT: LBB24_37: ## %cond.load52 ; SSE42-NEXT: pinsrb $2, 18(%rdi), %xmm3 -; SSE42-NEXT: LBB24_38: ## %else53 -; SSE42-NEXT: pextrb $3, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000 ; SSE42-NEXT: je LBB24_40 -; SSE42-NEXT: ## %bb.39: ## %cond.load55 +; 
SSE42-NEXT: LBB24_39: ## %cond.load55 ; SSE42-NEXT: pinsrb $3, 19(%rdi), %xmm3 -; SSE42-NEXT: LBB24_40: ## %else56 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $4, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000 ; SSE42-NEXT: je LBB24_42 -; SSE42-NEXT: ## %bb.41: ## %cond.load58 +; SSE42-NEXT: LBB24_41: ## %cond.load58 ; SSE42-NEXT: pinsrb $4, 20(%rdi), %xmm3 -; SSE42-NEXT: LBB24_42: ## %else59 -; SSE42-NEXT: pextrb $5, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000 ; SSE42-NEXT: je LBB24_44 -; SSE42-NEXT: ## %bb.43: ## %cond.load61 +; SSE42-NEXT: LBB24_43: ## %cond.load61 ; SSE42-NEXT: pinsrb $5, 21(%rdi), %xmm3 -; SSE42-NEXT: LBB24_44: ## %else62 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $6, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000 ; SSE42-NEXT: je LBB24_46 -; SSE42-NEXT: ## %bb.45: ## %cond.load64 +; SSE42-NEXT: LBB24_45: ## %cond.load64 ; SSE42-NEXT: pinsrb $6, 22(%rdi), %xmm3 -; SSE42-NEXT: LBB24_46: ## %else65 -; SSE42-NEXT: pextrb $7, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000 ; SSE42-NEXT: je LBB24_48 -; SSE42-NEXT: ## %bb.47: ## %cond.load67 +; SSE42-NEXT: LBB24_47: ## %cond.load67 ; SSE42-NEXT: pinsrb $7, 23(%rdi), %xmm3 -; SSE42-NEXT: LBB24_48: ## %else68 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $8, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; SSE42-NEXT: je LBB24_50 -; SSE42-NEXT: ## %bb.49: ## %cond.load70 +; SSE42-NEXT: LBB24_49: ## %cond.load70 ; SSE42-NEXT: pinsrb $8, 24(%rdi), %xmm3 -; SSE42-NEXT: LBB24_50: ## %else71 -; SSE42-NEXT: pextrb $9, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; SSE42-NEXT: je LBB24_52 -; SSE42-NEXT: ## %bb.51: ## %cond.load73 +; SSE42-NEXT: LBB24_51: ## %cond.load73 ; SSE42-NEXT: pinsrb $9, 25(%rdi), %xmm3 -; SSE42-NEXT: LBB24_52: ## %else74 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $10, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; SSE42-NEXT: je LBB24_54 -; SSE42-NEXT: ## %bb.53: ## %cond.load76 +; SSE42-NEXT: LBB24_53: ## %cond.load76 ; SSE42-NEXT: pinsrb $10, 26(%rdi), %xmm3 -; SSE42-NEXT: LBB24_54: ## %else77 -; SSE42-NEXT: pextrb $11, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; SSE42-NEXT: je LBB24_56 -; SSE42-NEXT: ## %bb.55: ## %cond.load79 +; SSE42-NEXT: LBB24_55: ## %cond.load79 ; SSE42-NEXT: pinsrb $11, 27(%rdi), %xmm3 -; SSE42-NEXT: LBB24_56: ## %else80 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $12, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; SSE42-NEXT: je LBB24_58 -; SSE42-NEXT: ## %bb.57: ## %cond.load82 +; SSE42-NEXT: LBB24_57: ## %cond.load82 ; SSE42-NEXT: pinsrb $12, 28(%rdi), %xmm3 -; SSE42-NEXT: LBB24_58: ## %else83 -; SSE42-NEXT: pextrb $13, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; SSE42-NEXT: je LBB24_60 -; SSE42-NEXT: ## %bb.59: ## %cond.load85 +; SSE42-NEXT: LBB24_59: ## %cond.load85 ; SSE42-NEXT: pinsrb $13, 29(%rdi), %xmm3 -; SSE42-NEXT: LBB24_60: ## %else86 
-; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE42-NEXT: pextrb $14, %xmm0, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; SSE42-NEXT: je LBB24_62 -; SSE42-NEXT: ## %bb.61: ## %cond.load88 +; SSE42-NEXT: LBB24_61: ## %cond.load88 ; SSE42-NEXT: pinsrb $14, 30(%rdi), %xmm3 -; SSE42-NEXT: LBB24_62: ## %else89 -; SSE42-NEXT: pextrb $15, %xmm0, %eax -; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: je LBB24_64 -; SSE42-NEXT: ## %bb.63: ## %cond.load91 -; SSE42-NEXT: pinsrb $15, 31(%rdi), %xmm3 -; SSE42-NEXT: LBB24_64: ## %else92 -; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm1 -; SSE42-NEXT: retq +; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; SSE42-NEXT: jne LBB24_63 +; SSE42-NEXT: jmp LBB24_64 ; ; AVX1-LABEL: load_v32i8_v32i8: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_2 -; AVX1-NEXT: ## %bb.1: ## %cond.load -; AVX1-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: LBB24_2: ## %else -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_4 -; AVX1-NEXT: ## %bb.3: ## %cond.load1 -; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: shll $16, %eax +; AVX1-NEXT: orl %ecx, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne LBB24_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB24_3 ; AVX1-NEXT: LBB24_4: ## %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $2, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_6 -; AVX1-NEXT: ## %bb.5: ## %cond.load4 -; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne LBB24_5 ; AVX1-NEXT: LBB24_6: ## %else5 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_8 -; AVX1-NEXT: ## %bb.7: ## %cond.load7 -; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne LBB24_7 ; AVX1-NEXT: LBB24_8: ## %else8 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_10 -; AVX1-NEXT: ## %bb.9: ## %cond.load10 -; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne LBB24_9 ; AVX1-NEXT: LBB24_10: ## %else11 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_12 -; AVX1-NEXT: ## %bb.11: ## %cond.load13 -; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne LBB24_11 ; AVX1-NEXT: LBB24_12: ## %else14 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $6, %xmm3, 
%eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_14 -; AVX1-NEXT: ## %bb.13: ## %cond.load16 -; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne LBB24_13 ; AVX1-NEXT: LBB24_14: ## %else17 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_16 -; AVX1-NEXT: ## %bb.15: ## %cond.load19 -; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne LBB24_15 ; AVX1-NEXT: LBB24_16: ## %else20 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_18 -; AVX1-NEXT: ## %bb.17: ## %cond.load22 -; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 +; AVX1-NEXT: jne LBB24_17 ; AVX1-NEXT: LBB24_18: ## %else23 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_20 -; AVX1-NEXT: ## %bb.19: ## %cond.load25 -; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 +; AVX1-NEXT: jne LBB24_19 ; AVX1-NEXT: LBB24_20: ## %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $10, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_22 -; AVX1-NEXT: ## %bb.21: ## %cond.load28 -; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX1-NEXT: jne LBB24_21 ; AVX1-NEXT: LBB24_22: ## %else29 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_24 -; AVX1-NEXT: ## %bb.23: ## %cond.load31 -; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX1-NEXT: jne LBB24_23 ; AVX1-NEXT: LBB24_24: ## %else32 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_26 -; AVX1-NEXT: ## %bb.25: ## %cond.load34 -; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX1-NEXT: jne LBB24_25 ; AVX1-NEXT: LBB24_26: ## %else35 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_28 -; AVX1-NEXT: ## %bb.27: ## %cond.load37 -; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX1-NEXT: jne LBB24_27 ; AVX1-NEXT: LBB24_28: ## %else38 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $14, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_30 -; AVX1-NEXT: ## %bb.29: ## %cond.load40 -; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; 
AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX1-NEXT: jne LBB24_29 ; AVX1-NEXT: LBB24_30: ## %else41 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_32 -; AVX1-NEXT: ## %bb.31: ## %cond.load43 -; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX1-NEXT: jne LBB24_31 ; AVX1-NEXT: LBB24_32: ## %else44 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_34 -; AVX1-NEXT: ## %bb.33: ## %cond.load46 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX1-NEXT: jne LBB24_33 ; AVX1-NEXT: LBB24_34: ## %else47 -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_36 -; AVX1-NEXT: ## %bb.35: ## %cond.load49 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX1-NEXT: jne LBB24_35 ; AVX1-NEXT: LBB24_36: ## %else50 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_38 -; AVX1-NEXT: ## %bb.37: ## %cond.load52 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX1-NEXT: jne LBB24_37 ; AVX1-NEXT: LBB24_38: ## %else53 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_40 -; AVX1-NEXT: ## %bb.39: ## %cond.load55 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX1-NEXT: jne LBB24_39 ; AVX1-NEXT: LBB24_40: ## %else56 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_42 -; AVX1-NEXT: ## %bb.41: ## %cond.load58 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX1-NEXT: jne LBB24_41 ; AVX1-NEXT: LBB24_42: ## %else59 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_44 -; AVX1-NEXT: ## %bb.43: ## %cond.load61 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX1-NEXT: jne LBB24_43 ; AVX1-NEXT: LBB24_44: ## %else62 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_46 -; AVX1-NEXT: ## %bb.45: ## %cond.load64 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 +; 
AVX1-NEXT: jne LBB24_45 ; AVX1-NEXT: LBB24_46: ## %else65 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_48 -; AVX1-NEXT: ## %bb.47: ## %cond.load67 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX1-NEXT: jne LBB24_47 ; AVX1-NEXT: LBB24_48: ## %else68 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_50 -; AVX1-NEXT: ## %bb.49: ## %cond.load70 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX1-NEXT: jne LBB24_49 ; AVX1-NEXT: LBB24_50: ## %else71 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_52 -; AVX1-NEXT: ## %bb.51: ## %cond.load73 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX1-NEXT: jne LBB24_51 ; AVX1-NEXT: LBB24_52: ## %else74 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_54 -; AVX1-NEXT: ## %bb.53: ## %cond.load76 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX1-NEXT: jne LBB24_53 ; AVX1-NEXT: LBB24_54: ## %else77 -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_56 -; AVX1-NEXT: ## %bb.55: ## %cond.load79 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX1-NEXT: jne LBB24_55 ; AVX1-NEXT: LBB24_56: ## %else80 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_58 -; AVX1-NEXT: ## %bb.57: ## %cond.load82 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX1-NEXT: jne LBB24_57 ; AVX1-NEXT: LBB24_58: ## %else83 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_60 -; AVX1-NEXT: ## %bb.59: ## %cond.load85 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX1-NEXT: jne LBB24_59 ; AVX1-NEXT: LBB24_60: ## %else86 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB24_62 -; AVX1-NEXT: ## %bb.61: ## %cond.load88 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX1-NEXT: jne LBB24_61 ; AVX1-NEXT: LBB24_62: ## %else89 -; 
AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; AVX1-NEXT: jne LBB24_63 +; AVX1-NEXT: LBB24_64: ## %else92 +; AVX1-NEXT: vmovaps %ymm1, %ymm0 +; AVX1-NEXT: retq +; AVX1-NEXT: LBB24_1: ## %cond.load +; AVX1-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je LBB24_4 +; AVX1-NEXT: LBB24_3: ## %cond.load1 +; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je LBB24_6 +; AVX1-NEXT: LBB24_5: ## %cond.load4 +; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je LBB24_8 +; AVX1-NEXT: LBB24_7: ## %cond.load7 +; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: je LBB24_10 +; AVX1-NEXT: LBB24_9: ## %cond.load10 +; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: je LBB24_12 +; AVX1-NEXT: LBB24_11: ## %cond.load13 +; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: je LBB24_14 +; AVX1-NEXT: LBB24_13: ## %cond.load16 +; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: je LBB24_16 +; AVX1-NEXT: LBB24_15: ## %cond.load19 +; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 +; AVX1-NEXT: je LBB24_18 +; AVX1-NEXT: LBB24_17: ## %cond.load22 +; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 +; AVX1-NEXT: je LBB24_20 +; AVX1-NEXT: LBB24_19: ## %cond.load25 +; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX1-NEXT: je LBB24_22 +; AVX1-NEXT: LBB24_21: ## %cond.load28 +; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX1-NEXT: je LBB24_24 +; AVX1-NEXT: LBB24_23: ## %cond.load31 +; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX1-NEXT: je LBB24_26 +; AVX1-NEXT: LBB24_25: ## %cond.load34 +; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX1-NEXT: je LBB24_28 +; AVX1-NEXT: LBB24_27: ## %cond.load37 +; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX1-NEXT: je LBB24_30 +; AVX1-NEXT: LBB24_29: ## %cond.load40 +; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX1-NEXT: je LBB24_32 +; AVX1-NEXT: 
LBB24_31: ## %cond.load43 +; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX1-NEXT: je LBB24_34 +; AVX1-NEXT: LBB24_33: ## %cond.load46 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX1-NEXT: je LBB24_36 +; AVX1-NEXT: LBB24_35: ## %cond.load49 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX1-NEXT: je LBB24_38 +; AVX1-NEXT: LBB24_37: ## %cond.load52 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX1-NEXT: je LBB24_40 +; AVX1-NEXT: LBB24_39: ## %cond.load55 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX1-NEXT: je LBB24_42 +; AVX1-NEXT: LBB24_41: ## %cond.load58 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX1-NEXT: je LBB24_44 +; AVX1-NEXT: LBB24_43: ## %cond.load61 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX1-NEXT: je LBB24_46 +; AVX1-NEXT: LBB24_45: ## %cond.load64 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX1-NEXT: je LBB24_48 +; AVX1-NEXT: LBB24_47: ## %cond.load67 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX1-NEXT: je LBB24_50 +; AVX1-NEXT: LBB24_49: ## %cond.load70 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX1-NEXT: je LBB24_52 +; AVX1-NEXT: LBB24_51: ## %cond.load73 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX1-NEXT: je LBB24_54 +; AVX1-NEXT: LBB24_53: ## %cond.load76 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX1-NEXT: je LBB24_56 +; AVX1-NEXT: LBB24_55: ## %cond.load79 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX1-NEXT: je LBB24_58 +; AVX1-NEXT: LBB24_57: ## %cond.load82 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm1 +; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX1-NEXT: je LBB24_60 +; AVX1-NEXT: LBB24_59: ## %cond.load85 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX1-NEXT: je LBB24_62 +; AVX1-NEXT: LBB24_61: ## %cond.load88 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX1-NEXT: je LBB24_64 -; AVX1-NEXT: ## %bb.63: ## %cond.load91 +; AVX1-NEXT: LBB24_63: ## %cond.load91 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: LBB24_64: ## %else92 ; AVX1-NEXT: vmovaps %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v32i8_v32i8: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $0, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_2 -; AVX2-NEXT: ## %bb.1: ## %cond.load -; AVX2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: LBB24_2: ## %else -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_4 -; AVX2-NEXT: ## %bb.3: ## %cond.load1 -; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: jne LBB24_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB24_3 ; AVX2-NEXT: LBB24_4: ## %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $2, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_6 -; AVX2-NEXT: ## %bb.5: ## %cond.load4 -; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne LBB24_5 ; AVX2-NEXT: LBB24_6: ## %else5 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_8 -; AVX2-NEXT: ## %bb.7: ## %cond.load7 -; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne LBB24_7 ; AVX2-NEXT: LBB24_8: ## %else8 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $4, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_10 -; AVX2-NEXT: ## %bb.9: ## %cond.load10 -; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne LBB24_9 ; AVX2-NEXT: LBB24_10: ## %else11 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_12 -; AVX2-NEXT: ## %bb.11: ## %cond.load13 -; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne LBB24_11 ; AVX2-NEXT: LBB24_12: ## %else14 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $6, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; 
AVX2-NEXT: je LBB24_14 -; AVX2-NEXT: ## %bb.13: ## %cond.load16 -; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne LBB24_13 ; AVX2-NEXT: LBB24_14: ## %else17 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_16 -; AVX2-NEXT: ## %bb.15: ## %cond.load19 -; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne LBB24_15 ; AVX2-NEXT: LBB24_16: ## %else20 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $8, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_18 -; AVX2-NEXT: ## %bb.17: ## %cond.load22 -; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 +; AVX2-NEXT: jne LBB24_17 ; AVX2-NEXT: LBB24_18: ## %else23 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_20 -; AVX2-NEXT: ## %bb.19: ## %cond.load25 -; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $512, %eax ## imm = 0x200 +; AVX2-NEXT: jne LBB24_19 ; AVX2-NEXT: LBB24_20: ## %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $10, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_22 -; AVX2-NEXT: ## %bb.21: ## %cond.load28 -; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX2-NEXT: jne LBB24_21 ; AVX2-NEXT: LBB24_22: ## %else29 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_24 -; AVX2-NEXT: ## %bb.23: ## %cond.load31 -; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX2-NEXT: jne LBB24_23 ; AVX2-NEXT: LBB24_24: ## %else32 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_26 -; AVX2-NEXT: ## %bb.25: ## %cond.load34 -; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX2-NEXT: jne LBB24_25 ; AVX2-NEXT: LBB24_26: ## %else35 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_28 -; AVX2-NEXT: ## %bb.27: ## %cond.load37 -; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX2-NEXT: jne LBB24_27 ; AVX2-NEXT: LBB24_28: ## %else38 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $14, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_30 -; AVX2-NEXT: ## %bb.29: ## %cond.load40 -; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $16384, %eax ## imm = 
0x4000 +; AVX2-NEXT: jne LBB24_29 ; AVX2-NEXT: LBB24_30: ## %else41 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_32 -; AVX2-NEXT: ## %bb.31: ## %cond.load43 -; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX2-NEXT: jne LBB24_31 ; AVX2-NEXT: LBB24_32: ## %else44 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_34 -; AVX2-NEXT: ## %bb.33: ## %cond.load46 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX2-NEXT: jne LBB24_33 ; AVX2-NEXT: LBB24_34: ## %else47 -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_36 -; AVX2-NEXT: ## %bb.35: ## %cond.load49 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX2-NEXT: jne LBB24_35 ; AVX2-NEXT: LBB24_36: ## %else50 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_38 -; AVX2-NEXT: ## %bb.37: ## %cond.load52 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX2-NEXT: jne LBB24_37 ; AVX2-NEXT: LBB24_38: ## %else53 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_40 -; AVX2-NEXT: ## %bb.39: ## %cond.load55 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX2-NEXT: jne LBB24_39 ; AVX2-NEXT: LBB24_40: ## %else56 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_42 -; AVX2-NEXT: ## %bb.41: ## %cond.load58 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX2-NEXT: jne LBB24_41 ; AVX2-NEXT: LBB24_42: ## %else59 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_44 -; AVX2-NEXT: ## %bb.43: ## %cond.load61 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX2-NEXT: jne LBB24_43 ; AVX2-NEXT: LBB24_44: ## %else62 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_46 -; AVX2-NEXT: ## %bb.45: ## %cond.load64 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX2-NEXT: jne LBB24_45 ; AVX2-NEXT: 
LBB24_46: ## %else65 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_48 -; AVX2-NEXT: ## %bb.47: ## %cond.load67 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX2-NEXT: jne LBB24_47 ; AVX2-NEXT: LBB24_48: ## %else68 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_50 -; AVX2-NEXT: ## %bb.49: ## %cond.load70 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX2-NEXT: jne LBB24_49 ; AVX2-NEXT: LBB24_50: ## %else71 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_52 -; AVX2-NEXT: ## %bb.51: ## %cond.load73 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX2-NEXT: jne LBB24_51 ; AVX2-NEXT: LBB24_52: ## %else74 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_54 -; AVX2-NEXT: ## %bb.53: ## %cond.load76 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX2-NEXT: jne LBB24_53 ; AVX2-NEXT: LBB24_54: ## %else77 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_56 -; AVX2-NEXT: ## %bb.55: ## %cond.load79 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX2-NEXT: jne LBB24_55 ; AVX2-NEXT: LBB24_56: ## %else80 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_58 -; AVX2-NEXT: ## %bb.57: ## %cond.load82 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX2-NEXT: jne LBB24_57 ; AVX2-NEXT: LBB24_58: ## %else83 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_60 -; AVX2-NEXT: ## %bb.59: ## %cond.load85 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX2-NEXT: jne LBB24_59 ; AVX2-NEXT: LBB24_60: ## %else86 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB24_62 -; AVX2-NEXT: ## %bb.61: ## %cond.load88 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX2-NEXT: jne LBB24_61 ; AVX2-NEXT: LBB24_62: ## %else89 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; 
AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; AVX2-NEXT: jne LBB24_63 +; AVX2-NEXT: LBB24_64: ## %else92 +; AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-NEXT: retq +; AVX2-NEXT: LBB24_1: ## %cond.load +; AVX2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je LBB24_4 +; AVX2-NEXT: LBB24_3: ## %cond.load1 +; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: je LBB24_6 +; AVX2-NEXT: LBB24_5: ## %cond.load4 +; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: je LBB24_8 +; AVX2-NEXT: LBB24_7: ## %cond.load7 +; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: je LBB24_10 +; AVX2-NEXT: LBB24_9: ## %cond.load10 +; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: je LBB24_12 +; AVX2-NEXT: LBB24_11: ## %cond.load13 +; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: je LBB24_14 +; AVX2-NEXT: LBB24_13: ## %cond.load16 +; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: je LBB24_16 +; AVX2-NEXT: LBB24_15: ## %cond.load19 +; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 +; AVX2-NEXT: je LBB24_18 +; AVX2-NEXT: LBB24_17: ## %cond.load22 +; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $512, %eax ## imm = 0x200 +; AVX2-NEXT: je LBB24_20 +; AVX2-NEXT: LBB24_19: ## %cond.load25 +; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX2-NEXT: je LBB24_22 +; AVX2-NEXT: LBB24_21: ## %cond.load28 +; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX2-NEXT: je LBB24_24 +; AVX2-NEXT: LBB24_23: ## %cond.load31 +; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX2-NEXT: je LBB24_26 +; AVX2-NEXT: LBB24_25: ## %cond.load34 +; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX2-NEXT: je LBB24_28 +; AVX2-NEXT: LBB24_27: ## %cond.load37 +; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX2-NEXT: je LBB24_30 +; AVX2-NEXT: LBB24_29: ## %cond.load40 +; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX2-NEXT: je LBB24_32 +; AVX2-NEXT: LBB24_31: ## %cond.load43 +; AVX2-NEXT: vpinsrb 
$15, 15(%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX2-NEXT: je LBB24_34 +; AVX2-NEXT: LBB24_33: ## %cond.load46 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX2-NEXT: je LBB24_36 +; AVX2-NEXT: LBB24_35: ## %cond.load49 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX2-NEXT: je LBB24_38 +; AVX2-NEXT: LBB24_37: ## %cond.load52 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX2-NEXT: je LBB24_40 +; AVX2-NEXT: LBB24_39: ## %cond.load55 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX2-NEXT: je LBB24_42 +; AVX2-NEXT: LBB24_41: ## %cond.load58 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX2-NEXT: je LBB24_44 +; AVX2-NEXT: LBB24_43: ## %cond.load61 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX2-NEXT: je LBB24_46 +; AVX2-NEXT: LBB24_45: ## %cond.load64 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX2-NEXT: je LBB24_48 +; AVX2-NEXT: LBB24_47: ## %cond.load67 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX2-NEXT: je LBB24_50 +; AVX2-NEXT: LBB24_49: ## %cond.load70 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX2-NEXT: je LBB24_52 +; AVX2-NEXT: LBB24_51: ## %cond.load73 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX2-NEXT: je LBB24_54 +; AVX2-NEXT: LBB24_53: ## %cond.load76 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX2-NEXT: je LBB24_56 +; AVX2-NEXT: LBB24_55: ## %cond.load79 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX2-NEXT: je LBB24_58 +; AVX2-NEXT: LBB24_57: ## %cond.load82 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $536870912, %eax ## imm = 
0x20000000 +; AVX2-NEXT: je LBB24_60 +; AVX2-NEXT: LBB24_59: ## %cond.load85 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX2-NEXT: je LBB24_62 +; AVX2-NEXT: LBB24_61: ## %cond.load88 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX2-NEXT: je LBB24_64 -; AVX2-NEXT: ## %bb.63: ## %cond.load91 +; AVX2-NEXT: LBB24_63: ## %cond.load91 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-NEXT: LBB24_64: ## %else92 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v32i8_v32i8: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpmovmskb %ymm0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_2 -; AVX512F-NEXT: ## %bb.1: ## %cond.load -; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: LBB24_2: ## %else -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_4 -; AVX512F-NEXT: ## %bb.3: ## %cond.load1 -; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: jne LBB24_1 +; AVX512F-NEXT: ## %bb.2: ## %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne LBB24_3 ; AVX512F-NEXT: LBB24_4: ## %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_6 -; AVX512F-NEXT: ## %bb.5: ## %cond.load4 -; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne LBB24_5 ; AVX512F-NEXT: LBB24_6: ## %else5 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_8 -; AVX512F-NEXT: ## %bb.7: ## %cond.load7 -; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne LBB24_7 ; AVX512F-NEXT: LBB24_8: ## %else8 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_10 -; AVX512F-NEXT: ## %bb.9: ## %cond.load10 -; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne LBB24_9 ; AVX512F-NEXT: LBB24_10: ## %else11 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_12 -; AVX512F-NEXT: ## %bb.11: ## %cond.load13 -; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne LBB24_11 ; AVX512F-NEXT: LBB24_12: ## %else14 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_14 -; AVX512F-NEXT: ## %bb.13: ## %cond.load16 -; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne LBB24_13 ; AVX512F-NEXT: LBB24_14: ## %else17 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_16 -; AVX512F-NEXT: ## %bb.15: ## %cond.load19 -; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne LBB24_15 ; AVX512F-NEXT: LBB24_16: ## %else20 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_18 -; AVX512F-NEXT: ## %bb.17: ## %cond.load22 -; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 +; AVX512F-NEXT: jne LBB24_17 ; AVX512F-NEXT: LBB24_18: ## %else23 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_20 -; AVX512F-NEXT: ## %bb.19: ## %cond.load25 -; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 +; AVX512F-NEXT: jne LBB24_19 ; AVX512F-NEXT: LBB24_20: ## %else26 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_22 -; AVX512F-NEXT: ## %bb.21: ## %cond.load28 -; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX512F-NEXT: jne LBB24_21 ; AVX512F-NEXT: LBB24_22: ## %else29 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_24 -; AVX512F-NEXT: ## %bb.23: ## %cond.load31 -; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX512F-NEXT: jne LBB24_23 ; AVX512F-NEXT: LBB24_24: ## %else32 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_26 -; AVX512F-NEXT: ## %bb.25: ## %cond.load34 -; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX512F-NEXT: jne LBB24_25 ; AVX512F-NEXT: LBB24_26: ## %else35 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_28 -; AVX512F-NEXT: ## %bb.27: ## %cond.load37 -; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX512F-NEXT: jne LBB24_27 ; AVX512F-NEXT: LBB24_28: ## %else38 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_30 -; AVX512F-NEXT: ## %bb.29: ## %cond.load40 -; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX512F-NEXT: jne LBB24_29 ; AVX512F-NEXT: LBB24_30: ## %else41 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_32 -; AVX512F-NEXT: ## %bb.31: ## %cond.load43 -; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX512F-NEXT: jne LBB24_31 ; AVX512F-NEXT: LBB24_32: ## %else44 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_34 -; AVX512F-NEXT: ## %bb.33: ## %cond.load46 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX512F-NEXT: jne LBB24_33 ; AVX512F-NEXT: LBB24_34: ## %else47 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_36 -; AVX512F-NEXT: ## %bb.35: ## %cond.load49 -; 
AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX512F-NEXT: jne LBB24_35 ; AVX512F-NEXT: LBB24_36: ## %else50 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_38 -; AVX512F-NEXT: ## %bb.37: ## %cond.load52 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX512F-NEXT: jne LBB24_37 ; AVX512F-NEXT: LBB24_38: ## %else53 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_40 -; AVX512F-NEXT: ## %bb.39: ## %cond.load55 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX512F-NEXT: jne LBB24_39 ; AVX512F-NEXT: LBB24_40: ## %else56 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_42 -; AVX512F-NEXT: ## %bb.41: ## %cond.load58 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX512F-NEXT: jne LBB24_41 ; AVX512F-NEXT: LBB24_42: ## %else59 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_44 -; AVX512F-NEXT: ## %bb.43: ## %cond.load61 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX512F-NEXT: jne LBB24_43 ; AVX512F-NEXT: LBB24_44: ## %else62 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_46 -; AVX512F-NEXT: ## %bb.45: ## %cond.load64 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX512F-NEXT: jne LBB24_45 ; AVX512F-NEXT: LBB24_46: ## %else65 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_48 -; AVX512F-NEXT: ## %bb.47: ## %cond.load67 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX512F-NEXT: jne LBB24_47 ; AVX512F-NEXT: LBB24_48: ## %else68 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: 
vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_50 -; AVX512F-NEXT: ## %bb.49: ## %cond.load70 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX512F-NEXT: jne LBB24_49 ; AVX512F-NEXT: LBB24_50: ## %else71 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_52 -; AVX512F-NEXT: ## %bb.51: ## %cond.load73 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX512F-NEXT: jne LBB24_51 ; AVX512F-NEXT: LBB24_52: ## %else74 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_54 -; AVX512F-NEXT: ## %bb.53: ## %cond.load76 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX512F-NEXT: jne LBB24_53 ; AVX512F-NEXT: LBB24_54: ## %else77 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_56 -; AVX512F-NEXT: ## %bb.55: ## %cond.load79 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX512F-NEXT: jne LBB24_55 ; AVX512F-NEXT: LBB24_56: ## %else80 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_58 -; AVX512F-NEXT: ## %bb.57: ## %cond.load82 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX512F-NEXT: jne LBB24_57 ; AVX512F-NEXT: LBB24_58: ## %else83 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB24_60 -; AVX512F-NEXT: ## %bb.59: ## %cond.load85 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX512F-NEXT: jne LBB24_59 ; AVX512F-NEXT: LBB24_60: ## %else86 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX512F-NEXT: jne 
LBB24_61 +; AVX512F-NEXT: LBB24_62: ## %else89 +; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; AVX512F-NEXT: jne LBB24_63 +; AVX512F-NEXT: LBB24_64: ## %else92 +; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-NEXT: retq +; AVX512F-NEXT: LBB24_1: ## %cond.load +; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: je LBB24_4 +; AVX512F-NEXT: LBB24_3: ## %cond.load1 +; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: je LBB24_6 +; AVX512F-NEXT: LBB24_5: ## %cond.load4 +; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: je LBB24_8 +; AVX512F-NEXT: LBB24_7: ## %cond.load7 +; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: je LBB24_10 +; AVX512F-NEXT: LBB24_9: ## %cond.load10 +; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: je LBB24_12 +; AVX512F-NEXT: LBB24_11: ## %cond.load13 +; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: je LBB24_14 +; AVX512F-NEXT: LBB24_13: ## %cond.load16 +; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: je LBB24_16 +; AVX512F-NEXT: LBB24_15: ## %cond.load19 +; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 +; AVX512F-NEXT: je LBB24_18 +; AVX512F-NEXT: LBB24_17: ## %cond.load22 +; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 +; AVX512F-NEXT: je LBB24_20 +; AVX512F-NEXT: LBB24_19: ## %cond.load25 +; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX512F-NEXT: je LBB24_22 +; AVX512F-NEXT: LBB24_21: ## %cond.load28 +; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX512F-NEXT: je LBB24_24 +; AVX512F-NEXT: LBB24_23: ## %cond.load31 +; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX512F-NEXT: je LBB24_26 +; AVX512F-NEXT: LBB24_25: ## %cond.load34 +; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX512F-NEXT: je LBB24_28 +; AVX512F-NEXT: LBB24_27: ## %cond.load37 +; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX512F-NEXT: je LBB24_30 +; AVX512F-NEXT: LBB24_29: ## %cond.load40 +; 
AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX512F-NEXT: je LBB24_32 +; AVX512F-NEXT: LBB24_31: ## %cond.load43 +; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX512F-NEXT: je LBB24_34 +; AVX512F-NEXT: LBB24_33: ## %cond.load46 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX512F-NEXT: je LBB24_36 +; AVX512F-NEXT: LBB24_35: ## %cond.load49 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX512F-NEXT: je LBB24_38 +; AVX512F-NEXT: LBB24_37: ## %cond.load52 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX512F-NEXT: je LBB24_40 +; AVX512F-NEXT: LBB24_39: ## %cond.load55 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX512F-NEXT: je LBB24_42 +; AVX512F-NEXT: LBB24_41: ## %cond.load58 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX512F-NEXT: je LBB24_44 +; AVX512F-NEXT: LBB24_43: ## %cond.load61 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX512F-NEXT: je LBB24_46 +; AVX512F-NEXT: LBB24_45: ## %cond.load64 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX512F-NEXT: je LBB24_48 +; AVX512F-NEXT: LBB24_47: ## %cond.load67 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX512F-NEXT: je LBB24_50 +; AVX512F-NEXT: LBB24_49: ## %cond.load70 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX512F-NEXT: je LBB24_52 +; AVX512F-NEXT: LBB24_51: ## %cond.load73 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX512F-NEXT: je LBB24_54 +; AVX512F-NEXT: LBB24_53: ## %cond.load76 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX512F-NEXT: je LBB24_56 +; AVX512F-NEXT: 
LBB24_55: ## %cond.load79 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX512F-NEXT: je LBB24_58 +; AVX512F-NEXT: LBB24_57: ## %cond.load82 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX512F-NEXT: je LBB24_60 +; AVX512F-NEXT: LBB24_59: ## %cond.load85 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX512F-NEXT: je LBB24_62 -; AVX512F-NEXT: ## %bb.61: ## %cond.load88 +; AVX512F-NEXT: LBB24_61: ## %cond.load88 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: LBB24_62: ## %else89 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX512F-NEXT: je LBB24_64 -; AVX512F-NEXT: ## %bb.63: ## %cond.load91 +; AVX512F-NEXT: LBB24_63: ## %cond.load91 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: LBB24_64: ## %else92 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: load_v32i8_v32i8: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_2 -; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load -; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512VLDQ-NEXT: LBB24_2: ## %else -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: vpmovmskb %ymm0, %eax ; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_4 -; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 -; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: jne LBB24_1 +; AVX512VLDQ-NEXT: ## %bb.2: ## %else +; AVX512VLDQ-NEXT: testb $2, %al +; AVX512VLDQ-NEXT: jne LBB24_3 ; AVX512VLDQ-NEXT: LBB24_4: ## %else2 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_6 -; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4 -; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $4, %al +; AVX512VLDQ-NEXT: jne LBB24_5 ; AVX512VLDQ-NEXT: LBB24_6: ## %else5 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; 
AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_8 -; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7 -; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $8, %al +; AVX512VLDQ-NEXT: jne LBB24_7 ; AVX512VLDQ-NEXT: LBB24_8: ## %else8 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_10 -; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10 -; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $16, %al +; AVX512VLDQ-NEXT: jne LBB24_9 ; AVX512VLDQ-NEXT: LBB24_10: ## %else11 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_12 -; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13 -; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $32, %al +; AVX512VLDQ-NEXT: jne LBB24_11 ; AVX512VLDQ-NEXT: LBB24_12: ## %else14 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_14 -; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16 -; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $64, %al +; AVX512VLDQ-NEXT: jne LBB24_13 ; AVX512VLDQ-NEXT: LBB24_14: ## %else17 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_16 -; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19 -; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testb $-128, %al +; AVX512VLDQ-NEXT: jne LBB24_15 ; AVX512VLDQ-NEXT: LBB24_16: ## %else20 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_18 -; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load22 -; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 +; AVX512VLDQ-NEXT: jne LBB24_17 ; AVX512VLDQ-NEXT: LBB24_18: ## %else23 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: 
kshiftrw $9, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_20 -; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load25 -; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 +; AVX512VLDQ-NEXT: jne LBB24_19 ; AVX512VLDQ-NEXT: LBB24_20: ## %else26 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_22 -; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load28 -; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX512VLDQ-NEXT: jne LBB24_21 ; AVX512VLDQ-NEXT: LBB24_22: ## %else29 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_24 -; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load31 -; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX512VLDQ-NEXT: jne LBB24_23 ; AVX512VLDQ-NEXT: LBB24_24: ## %else32 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_26 -; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load34 -; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX512VLDQ-NEXT: jne LBB24_25 ; AVX512VLDQ-NEXT: LBB24_26: ## %else35 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_28 -; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load37 -; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX512VLDQ-NEXT: jne LBB24_27 ; AVX512VLDQ-NEXT: LBB24_28: ## %else38 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 -; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB24_30 -; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load40 -; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm3 -; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX512VLDQ-NEXT: jne LBB24_29 ; AVX512VLDQ-NEXT: LBB24_30: ## %else41 -; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; 
AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_32
-; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load43
-; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
-; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: jne LBB24_31
 ; AVX512VLDQ-NEXT: LBB24_32: ## %else44
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_34
-; AVX512VLDQ-NEXT: ## %bb.33: ## %cond.load46
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
+; AVX512VLDQ-NEXT: jne LBB24_33
 ; AVX512VLDQ-NEXT: LBB24_34: ## %else47
-; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_36
-; AVX512VLDQ-NEXT: ## %bb.35: ## %cond.load49
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
+; AVX512VLDQ-NEXT: jne LBB24_35
 ; AVX512VLDQ-NEXT: LBB24_36: ## %else50
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_38
-; AVX512VLDQ-NEXT: ## %bb.37: ## %cond.load52
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
+; AVX512VLDQ-NEXT: jne LBB24_37
 ; AVX512VLDQ-NEXT: LBB24_38: ## %else53
-; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_40
-; AVX512VLDQ-NEXT: ## %bb.39: ## %cond.load55
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
+; AVX512VLDQ-NEXT: jne LBB24_39
 ; AVX512VLDQ-NEXT: LBB24_40: ## %else56
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_42
-; AVX512VLDQ-NEXT: ## %bb.41: ## %cond.load58
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
+; AVX512VLDQ-NEXT: jne LBB24_41
 ; AVX512VLDQ-NEXT: LBB24_42: ## %else59
-; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_44
-; AVX512VLDQ-NEXT: ## %bb.43: ## %cond.load61
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
+; AVX512VLDQ-NEXT: jne LBB24_43
 ; AVX512VLDQ-NEXT: LBB24_44: ## %else62
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_46
-; AVX512VLDQ-NEXT: ## %bb.45: ## %cond.load64
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
+; AVX512VLDQ-NEXT: jne LBB24_45
 ; AVX512VLDQ-NEXT: LBB24_46: ## %else65
-; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_48
-; AVX512VLDQ-NEXT: ## %bb.47: ## %cond.load67
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
+; AVX512VLDQ-NEXT: jne LBB24_47
 ; AVX512VLDQ-NEXT: LBB24_48: ## %else68
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_50
-; AVX512VLDQ-NEXT: ## %bb.49: ## %cond.load70
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
+; AVX512VLDQ-NEXT: jne LBB24_49
 ; AVX512VLDQ-NEXT: LBB24_50: ## %else71
-; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_52
-; AVX512VLDQ-NEXT: ## %bb.51: ## %cond.load73
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
+; AVX512VLDQ-NEXT: jne LBB24_51
 ; AVX512VLDQ-NEXT: LBB24_52: ## %else74
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_54
-; AVX512VLDQ-NEXT: ## %bb.53: ## %cond.load76
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
+; AVX512VLDQ-NEXT: jne LBB24_53
 ; AVX512VLDQ-NEXT: LBB24_54: ## %else77
-; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_56
-; AVX512VLDQ-NEXT: ## %bb.55: ## %cond.load79
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
+; AVX512VLDQ-NEXT: jne LBB24_55
 ; AVX512VLDQ-NEXT: LBB24_56: ## %else80
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_58
-; AVX512VLDQ-NEXT: ## %bb.57: ## %cond.load82
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
+; AVX512VLDQ-NEXT: jne LBB24_57
 ; AVX512VLDQ-NEXT: LBB24_58: ## %else83
-; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB24_60
-; AVX512VLDQ-NEXT: ## %bb.59: ## %cond.load85
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
+; AVX512VLDQ-NEXT: jne LBB24_59
 ; AVX512VLDQ-NEXT: LBB24_60: ## %else86
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
+; AVX512VLDQ-NEXT: jne LBB24_61
+; AVX512VLDQ-NEXT: LBB24_62: ## %else89
+; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
+; AVX512VLDQ-NEXT: jne LBB24_63
+; AVX512VLDQ-NEXT: LBB24_64: ## %else92
+; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLDQ-NEXT: retq
+; AVX512VLDQ-NEXT: LBB24_1: ## %cond.load
+; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $2, %al
+; AVX512VLDQ-NEXT: je LBB24_4
+; AVX512VLDQ-NEXT: LBB24_3: ## %cond.load1
+; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $4, %al
+; AVX512VLDQ-NEXT: je LBB24_6
+; AVX512VLDQ-NEXT: LBB24_5: ## %cond.load4
+; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $8, %al
+; AVX512VLDQ-NEXT: je LBB24_8
+; AVX512VLDQ-NEXT: LBB24_7: ## %cond.load7
+; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $16, %al
+; AVX512VLDQ-NEXT: je LBB24_10
+; AVX512VLDQ-NEXT: LBB24_9: ## %cond.load10
+; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $32, %al
+; AVX512VLDQ-NEXT: je LBB24_12
+; AVX512VLDQ-NEXT: LBB24_11: ## %cond.load13
+; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $64, %al
+; AVX512VLDQ-NEXT: je LBB24_14
+; AVX512VLDQ-NEXT: LBB24_13: ## %cond.load16
+; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: je LBB24_16
+; AVX512VLDQ-NEXT: LBB24_15: ## %cond.load19
+; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512VLDQ-NEXT: je LBB24_18
+; AVX512VLDQ-NEXT: LBB24_17: ## %cond.load22
+; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512VLDQ-NEXT: je LBB24_20
+; AVX512VLDQ-NEXT: LBB24_19: ## %cond.load25
+; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512VLDQ-NEXT: je LBB24_22
+; AVX512VLDQ-NEXT: LBB24_21: ## %cond.load28
+; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512VLDQ-NEXT: je LBB24_24
+; AVX512VLDQ-NEXT: LBB24_23: ## %cond.load31
+; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512VLDQ-NEXT: je LBB24_26
+; AVX512VLDQ-NEXT: LBB24_25: ## %cond.load34
+; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512VLDQ-NEXT: je LBB24_28
+; AVX512VLDQ-NEXT: LBB24_27: ## %cond.load37
+; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512VLDQ-NEXT: je LBB24_30
+; AVX512VLDQ-NEXT: LBB24_29: ## %cond.load40
+; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: je LBB24_32
+; AVX512VLDQ-NEXT: LBB24_31: ## %cond.load43
+; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
+; AVX512VLDQ-NEXT: je LBB24_34
+; AVX512VLDQ-NEXT: LBB24_33: ## %cond.load46
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
+; AVX512VLDQ-NEXT: je LBB24_36
+; AVX512VLDQ-NEXT: LBB24_35: ## %cond.load49
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
+; AVX512VLDQ-NEXT: je LBB24_38
+; AVX512VLDQ-NEXT: LBB24_37: ## %cond.load52
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
+; AVX512VLDQ-NEXT: je LBB24_40
+; AVX512VLDQ-NEXT: LBB24_39: ## %cond.load55
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
+; AVX512VLDQ-NEXT: je LBB24_42
+; AVX512VLDQ-NEXT: LBB24_41: ## %cond.load58
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
+; AVX512VLDQ-NEXT: je LBB24_44
+; AVX512VLDQ-NEXT: LBB24_43: ## %cond.load61
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
+; AVX512VLDQ-NEXT: je LBB24_46
+; AVX512VLDQ-NEXT: LBB24_45: ## %cond.load64
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
+; AVX512VLDQ-NEXT: je LBB24_48
+; AVX512VLDQ-NEXT: LBB24_47: ## %cond.load67
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
+; AVX512VLDQ-NEXT: je LBB24_50
+; AVX512VLDQ-NEXT: LBB24_49: ## %cond.load70
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
+; AVX512VLDQ-NEXT: je LBB24_52
+; AVX512VLDQ-NEXT: LBB24_51: ## %cond.load73
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
+; AVX512VLDQ-NEXT: je LBB24_54
+; AVX512VLDQ-NEXT: LBB24_53: ## %cond.load76
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
+; AVX512VLDQ-NEXT: je LBB24_56
+; AVX512VLDQ-NEXT: LBB24_55: ## %cond.load79
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
+; AVX512VLDQ-NEXT: je LBB24_58
+; AVX512VLDQ-NEXT: LBB24_57: ## %cond.load82
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
+; AVX512VLDQ-NEXT: je LBB24_60
+; AVX512VLDQ-NEXT: LBB24_59: ## %cond.load85
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
 ; AVX512VLDQ-NEXT: je LBB24_62
-; AVX512VLDQ-NEXT: ## %bb.61: ## %cond.load88
+; AVX512VLDQ-NEXT: LBB24_61: ## %cond.load88
 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
 ; AVX512VLDQ-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
-; AVX512VLDQ-NEXT: LBB24_62: ## %else89
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
 ; AVX512VLDQ-NEXT: je LBB24_64
-; AVX512VLDQ-NEXT: ## %bb.63: ## %cond.load91
+; AVX512VLDQ-NEXT: LBB24_63: ## %cond.load91
 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
 ; AVX512VLDQ-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
-; AVX512VLDQ-NEXT: LBB24_64: ## %else92
 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VLDQ-NEXT: retq
 ;
Index: llvm/trunk/test/CodeGen/X86/masked_store.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_store.ll
+++ llvm/trunk/test/CodeGen/X86/masked_store.ll
@@ -35,48 +35,23 @@
 }

 define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %val) {
-; SSE2-LABEL: store_v2f64_v2i64:
-; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB1_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlps %xmm1, (%rdi)
-; SSE2-NEXT: LBB1_2: ## %else
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB1_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhps %xmm1, 8(%rdi)
-; SSE2-NEXT: LBB1_4: ## %else2
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: store_v2f64_v2i64:
-; SSE4: ## %bb.0:
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm2
-; SSE4-NEXT: pextrb $0, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB1_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
-; SSE4-NEXT: movlps %xmm1, (%rdi)
-; SSE4-NEXT: LBB1_2: ## %else
-; SSE4-NEXT: pextrb $8, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB1_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
-; SSE4-NEXT: movhps %xmm1, 8(%rdi)
-; SSE4-NEXT: LBB1_4: ## %else2
-; SSE4-NEXT: retq
+; SSE-LABEL: store_v2f64_v2i64:
+; SSE: ## %bb.0:
+; SSE-NEXT: movmskpd %xmm0, %eax
+; SSE-NEXT: testb $1, %al
+; SSE-NEXT: jne LBB1_1
+; SSE-NEXT: ## %bb.2: ## %else
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: jne LBB1_3
+; SSE-NEXT: LBB1_4: ## %else2
+; SSE-NEXT: retq
+; SSE-NEXT: LBB1_1: ## %cond.store
+; SSE-NEXT: movlps %xmm1, (%rdi)
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: je LBB1_4
+; SSE-NEXT: LBB1_3: ## %cond.store1
+; SSE-NEXT: movhps %xmm1, 8(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX1OR2-LABEL: store_v2f64_v2i64:
 ; AVX1OR2: ## %bb.0:
@@ -113,82 +88,38 @@
 }

 define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x double> %val) {
-; SSE2-LABEL: store_v4f64_v4i64:
-; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB2_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlps %xmm2, (%rdi)
-; SSE2-NEXT: LBB2_2: ## %else
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB2_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhps %xmm2, 8(%rdi)
-; SSE2-NEXT: LBB2_4: ## %else2
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB2_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movlps %xmm3, 16(%rdi)
-; SSE2-NEXT: LBB2_6: ## %else4
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB2_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
-; SSE2-NEXT: movhps %xmm3, 24(%rdi)
-; SSE2-NEXT: LBB2_8: ## %else6
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: store_v4f64_v4i64:
-; SSE4: ## %bb.0:
-; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm4
-; SSE4-NEXT: pextrb $0, %xmm4, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB2_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
-; SSE4-NEXT: movlps %xmm2, (%rdi)
-; SSE4-NEXT: LBB2_2: ## %else
-; SSE4-NEXT: pextrb $8, %xmm4, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB2_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
-; SSE4-NEXT: movhps %xmm2, 8(%rdi)
-; SSE4-NEXT: LBB2_4: ## %else2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB2_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
-; SSE4-NEXT: movlps %xmm3, 16(%rdi)
-; SSE4-NEXT: LBB2_6: ## %else4
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB2_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
-; SSE4-NEXT: movhps %xmm3, 24(%rdi)
-; SSE4-NEXT: LBB2_8: ## %else6
-; SSE4-NEXT: retq
+; SSE-LABEL: store_v4f64_v4i64:
+; SSE: ## %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: testb $1, %al
+; SSE-NEXT: jne LBB2_1
+; SSE-NEXT: ## %bb.2: ## %else
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: jne LBB2_3
+; SSE-NEXT: LBB2_4: ## %else2
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: jne LBB2_5
+; SSE-NEXT: LBB2_6: ## %else4
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: jne LBB2_7
+; SSE-NEXT: LBB2_8: ## %else6
+; SSE-NEXT: retq
+; SSE-NEXT: LBB2_1: ## %cond.store
+; SSE-NEXT: movlps %xmm2, (%rdi)
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: je LBB2_4
+; SSE-NEXT: LBB2_3: ## %cond.store1
+; SSE-NEXT: movhps %xmm2, 8(%rdi)
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: je LBB2_6
+; SSE-NEXT: LBB2_5: ## %cond.store3
+; SSE-NEXT: movlps %xmm3, 16(%rdi)
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: je LBB2_8
+; SSE-NEXT: LBB2_7: ## %cond.store5
+; SSE-NEXT: movhps %xmm3, 24(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX1OR2-LABEL: store_v4f64_v4i64:
 ; AVX1OR2: ## %bb.0:
@@ -239,19 +170,21 @@
 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm0, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB3_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB3_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB3_3
+; SSE2-NEXT: LBB3_4: ## %else2
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB3_1: ## %cond.store
 ; SSE2-NEXT: movss %xmm1, (%rdi)
-; SSE2-NEXT: LBB3_2: ## %else
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB3_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
+; SSE2-NEXT: LBB3_3: ## %cond.store1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; SSE2-NEXT: movss %xmm1, 4(%rdi)
-; SSE2-NEXT: LBB3_4: ## %else2
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v2f32_v2i32:
@@ -259,18 +192,20 @@
 ; SSE4-NEXT: pxor %xmm2, %xmm2
 ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
+; SSE4-NEXT: movmskpd %xmm0, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB3_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB3_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB3_3
+; SSE4-NEXT: LBB3_4: ## %else2
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB3_1: ## %cond.store
 ; SSE4-NEXT: movss %xmm1, (%rdi)
-; SSE4-NEXT: LBB3_2: ## %else
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je LBB3_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB3_3: ## %cond.store1
 ; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi)
-; SSE4-NEXT: LBB3_4: ## %else2
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: store_v2f32_v2i32:
@@ -318,71 +253,71 @@
 define void @store_v4f32_v4i32(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
 ; SSE2-LABEL: store_v4f32_v4i32:
 ; SSE2: ## %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movmskps %xmm2, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB4_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB4_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB4_3
+; SSE2-NEXT: LBB4_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB4_5
+; SSE2-NEXT: LBB4_6: ## %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB4_7
+; SSE2-NEXT: LBB4_8: ## %else6
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB4_1: ## %cond.store
 ; SSE2-NEXT: movss %xmm0, (%rdi)
-; SSE2-NEXT: LBB4_2: ## %else
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB4_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
+; SSE2-NEXT: LBB4_3: ## %cond.store1
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
 ; SSE2-NEXT: movss %xmm1, 4(%rdi)
-; SSE2-NEXT: LBB4_4: ## %else2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB4_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE2-NEXT: movss %xmm2, 8(%rdi)
-; SSE2-NEXT: LBB4_6: ## %else4
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB4_5: ## %cond.store3
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movss %xmm1, 8(%rdi)
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB4_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
+; SSE2-NEXT: LBB4_7: ## %cond.store5
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT: movss %xmm0, 12(%rdi)
-; SSE2-NEXT: LBB4_8: ## %else6
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v4f32_v4i32:
 ; SSE4: ## %bb.0:
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm1, %eax
+; SSE4-NEXT: movmskps %xmm2, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB4_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB4_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB4_3
+; SSE4-NEXT: LBB4_4: ## %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne LBB4_5
+; SSE4-NEXT: LBB4_6: ## %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne LBB4_7
+; SSE4-NEXT: LBB4_8: ## %else6
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB4_1: ## %cond.store
 ; SSE4-NEXT: movss %xmm0, (%rdi)
-; SSE4-NEXT: LBB4_2: ## %else
-; SSE4-NEXT: pextrb $4, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je LBB4_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB4_3: ## %cond.store1
 ; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi)
-; SSE4-NEXT: LBB4_4: ## %else2
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
 ; SSE4-NEXT: je LBB4_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
+; SSE4-NEXT: LBB4_5: ## %cond.store3
 ; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
-; SSE4-NEXT: LBB4_6: ## %else4
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
 ; SSE4-NEXT: je LBB4_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
+; SSE4-NEXT: LBB4_7: ## %cond.store5
 ; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi)
-; SSE4-NEXT: LBB4_8: ## %else6
 ; SSE4-NEXT: retq
 ;
 ; AVX1OR2-LABEL: store_v4f32_v4i32:
@@ -422,136 +357,136 @@
 define void @store_v8f32_v8i32(<8 x float> %x, <8 x float>* %ptr, <8 x float> %y, <8 x i32> %mask) {
 ; SSE2-LABEL: store_v8f32_v8i32:
 ; SSE2: ## %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: packssdw %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: packsswb %xmm0, %xmm4
+; SSE2-NEXT: pmovmskb %xmm4, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB5_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB5_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB5_3
+; SSE2-NEXT: LBB5_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB5_5
+; SSE2-NEXT: LBB5_6: ## %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB5_7
+; SSE2-NEXT: LBB5_8: ## %else6
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: jne LBB5_9
+; SSE2-NEXT: LBB5_10: ## %else8
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: jne LBB5_11
+; SSE2-NEXT: LBB5_12: ## %else10
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: jne LBB5_13
+; SSE2-NEXT: LBB5_14: ## %else12
+; SSE2-NEXT: testb $-128, %al
+; SSE2-NEXT: jne LBB5_15
+; SSE2-NEXT: LBB5_16: ## %else14
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB5_1: ## %cond.store
 ; SSE2-NEXT: movd %xmm0, (%rdi)
-; SSE2-NEXT: LBB5_2: ## %else
-; SSE2-NEXT: psrlq $16, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: shrl $16, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB5_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
+; SSE2-NEXT: LBB5_3: ## %cond.store1
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
 ; SSE2-NEXT: movss %xmm2, 4(%rdi)
-; SSE2-NEXT: LBB5_4: ## %else2
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB5_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1]
-; SSE2-NEXT: movd %xmm3, 8(%rdi)
-; SSE2-NEXT: LBB5_6: ## %else4
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB5_5: ## %cond.store3
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: movss %xmm2, 8(%rdi)
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB5_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
+; SSE2-NEXT: LBB5_7: ## %cond.store5
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT: movss %xmm0, 12(%rdi)
-; SSE2-NEXT: LBB5_8: ## %else6
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $16, %al
 ; SSE2-NEXT: je LBB5_10
-; SSE2-NEXT: ## %bb.9: ## %cond.store7
+; SSE2-NEXT: LBB5_9: ## %cond.store7
 ; SSE2-NEXT: movss %xmm1, 16(%rdi)
-; SSE2-NEXT: LBB5_10: ## %else8
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $32, %al
 ; SSE2-NEXT: je LBB5_12
-; SSE2-NEXT: ## %bb.11: ## %cond.store9
+; SSE2-NEXT: LBB5_11: ## %cond.store9
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
 ; SSE2-NEXT: movss %xmm0, 20(%rdi)
-; SSE2-NEXT: LBB5_12: ## %else10
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je LBB5_14
-; SSE2-NEXT: ## %bb.13: ## %cond.store11
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movss %xmm2, 24(%rdi)
-; SSE2-NEXT: LBB5_14: ## %else12
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB5_13: ## %cond.store11
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: movss %xmm0, 24(%rdi)
+; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je LBB5_16
-; SSE2-NEXT: ## %bb.15: ## %cond.store13
+; SSE2-NEXT: LBB5_15: ## %cond.store13
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT: movss %xmm1, 28(%rdi)
-; SSE2-NEXT: LBB5_16: ## %else14
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v8f32_v8i32:
 ; SSE4: ## %bb.0:
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE4-NEXT: pextrb $0, %xmm2, %eax
+; SSE4-NEXT: packssdw %xmm5, %xmm4
+; SSE4-NEXT: packsswb %xmm0, %xmm4
+; SSE4-NEXT: pmovmskb %xmm4, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB5_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB5_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB5_3
+; SSE4-NEXT: LBB5_4: ## %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne LBB5_5
+; SSE4-NEXT: LBB5_6: ## %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne LBB5_7
+; SSE4-NEXT: LBB5_8: ## %else6
+; SSE4-NEXT: testb $16, %al
+; SSE4-NEXT: jne LBB5_9
+; SSE4-NEXT: LBB5_10: ## %else8
+; SSE4-NEXT: testb $32, %al
+; SSE4-NEXT: jne LBB5_11
+; SSE4-NEXT: LBB5_12: ## %else10
+; SSE4-NEXT: testb $64, %al
+; SSE4-NEXT: jne LBB5_13
+; SSE4-NEXT: LBB5_14: ## %else12
+; SSE4-NEXT: testb $-128, %al
+; SSE4-NEXT: jne LBB5_15
+; SSE4-NEXT: LBB5_16: ## %else14
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB5_1: ## %cond.store
 ; SSE4-NEXT: movd %xmm0, (%rdi)
-; SSE4-NEXT: LBB5_2: ## %else
-; SSE4-NEXT: pextrb $4, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je LBB5_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB5_3: ## %cond.store1
 ; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi)
-; SSE4-NEXT: LBB5_4: ## %else2
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE4-NEXT: pextrb $8, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
 ; SSE4-NEXT: je LBB5_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
+; SSE4-NEXT: LBB5_5: ## %cond.store3
 ; SSE4-NEXT: pextrd $2, %xmm0, 8(%rdi)
-; SSE4-NEXT: LBB5_6: ## %else4
-; SSE4-NEXT: pextrb $12, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
 ; SSE4-NEXT: je LBB5_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
+; SSE4-NEXT: LBB5_7: ## %cond.store5
 ; SSE4-NEXT: pextrd $3, %xmm0, 12(%rdi)
-; SSE4-NEXT: LBB5_8: ## %else6
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $16, %al
 ; SSE4-NEXT: je LBB5_10
-; SSE4-NEXT: ## %bb.9: ## %cond.store7
+; SSE4-NEXT: LBB5_9: ## %cond.store7
 ; SSE4-NEXT: movss %xmm1, 16(%rdi)
-; SSE4-NEXT: LBB5_10: ## %else8
-; SSE4-NEXT: pextrb $4, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $32, %al
 ; SSE4-NEXT: je LBB5_12
-; SSE4-NEXT: ## %bb.11: ## %cond.store9
+; SSE4-NEXT: LBB5_11: ## %cond.store9
 ; SSE4-NEXT: extractps $1, %xmm1, 20(%rdi)
-; SSE4-NEXT: LBB5_12: ## %else10
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $64, %al
 ; SSE4-NEXT: je LBB5_14
-; SSE4-NEXT: ## %bb.13: ## %cond.store11
+; SSE4-NEXT: LBB5_13: ## %cond.store11
 ; SSE4-NEXT: extractps $2, %xmm1, 24(%rdi)
-; SSE4-NEXT: LBB5_14: ## %else12
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $-128, %al
 ; SSE4-NEXT: je LBB5_16
-; SSE4-NEXT: ## %bb.15: ## %cond.store13
+; SSE4-NEXT: LBB5_15: ## %cond.store13
 ; SSE4-NEXT: extractps $3, %xmm1, 28(%rdi)
-; SSE4-NEXT: LBB5_16: ## %else14
 ; SSE4-NEXT: retq
 ;
 ; AVX1OR2-LABEL: store_v8f32_v8i32:
@@ -595,261 +530,263 @@
 ; SSE2-LABEL: store_v16f32_v16i32:
 ; SSE2: ## %bb.0:
 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: movd %xmm5, %eax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: packsswb %xmm5, %xmm4
+; SSE2-NEXT: pmovmskb %xmm4, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB6_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB6_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB6_3
+; SSE2-NEXT: LBB6_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB6_5
+; SSE2-NEXT: LBB6_6: ## %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB6_7
+; SSE2-NEXT: LBB6_8: ## %else6
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: jne LBB6_9
+; SSE2-NEXT: LBB6_10: ## %else8
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: jne LBB6_11
+; SSE2-NEXT: LBB6_12: ## %else10
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: jne LBB6_13
+; SSE2-NEXT: LBB6_14: ## %else12
+; SSE2-NEXT: testb $-128, %al
+; SSE2-NEXT: jne LBB6_15
+; SSE2-NEXT: LBB6_16: ## %else14
+; SSE2-NEXT: testl $256, %eax ## imm = 0x100
+; SSE2-NEXT: jne LBB6_17
+; SSE2-NEXT: LBB6_18: ## %else16
+; SSE2-NEXT: testl $512, %eax ## imm = 0x200
+; SSE2-NEXT: jne LBB6_19
+; SSE2-NEXT: LBB6_20: ## %else18
+; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE2-NEXT: jne LBB6_21
+; SSE2-NEXT: LBB6_22: ## %else20
+; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
+; SSE2-NEXT: jne LBB6_23
+; SSE2-NEXT: LBB6_24: ## %else22
+; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE2-NEXT: jne LBB6_25
+; SSE2-NEXT: LBB6_26: ## %else24
+; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
+; SSE2-NEXT: jne LBB6_27
+; SSE2-NEXT: LBB6_28: ## %else26
+; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE2-NEXT: jne LBB6_29
+; SSE2-NEXT: LBB6_30: ## %else28
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE2-NEXT: jne LBB6_31
+; SSE2-NEXT: LBB6_32: ## %else30
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB6_1: ## %cond.store
 ; SSE2-NEXT: movss %xmm0, (%rdi)
-; SSE2-NEXT: LBB6_2: ## %else
-; SSE2-NEXT: pextrw $2, %xmm5, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB6_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[2,3]
-; SSE2-NEXT: movss %xmm5, 4(%rdi)
-; SSE2-NEXT: LBB6_4: ## %else2
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pextrw $4, %xmm5, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB6_3: ## %cond.store1
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
+; SSE2-NEXT: movss %xmm4, 4(%rdi)
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB6_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
+; SSE2-NEXT: LBB6_5: ## %cond.store3
 ; SSE2-NEXT: movaps %xmm0, %xmm4
 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm4, 8(%rdi)
-; SSE2-NEXT: LBB6_6: ## %else4
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: pextrw $6, %xmm5, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB6_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
+; SSE2-NEXT: LBB6_7: ## %cond.store5
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE2-NEXT: movss %xmm0, 12(%rdi)
-; SSE2-NEXT: LBB6_8: ## %else6
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $16, %al
 ; SSE2-NEXT: je LBB6_10
-; SSE2-NEXT: ## %bb.9: ## %cond.store7
+; SSE2-NEXT: LBB6_9: ## %cond.store7
 ; SSE2-NEXT: movss %xmm1, 16(%rdi)
-; SSE2-NEXT: LBB6_10: ## %else8
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $32, %al
 ; SSE2-NEXT: je LBB6_12
-; SSE2-NEXT: ## %bb.11: ## %cond.store9
+; SSE2-NEXT: LBB6_11: ## %cond.store9
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
 ; SSE2-NEXT: movss %xmm0, 20(%rdi)
-; SSE2-NEXT: LBB6_12: ## %else10
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pextrw $4, %xmm5, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je LBB6_14
-; SSE2-NEXT: ## %bb.13: ## %cond.store11
+; SSE2-NEXT: LBB6_13: ## %cond.store11
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: movss %xmm0, 24(%rdi)
-; SSE2-NEXT: LBB6_14: ## %else12
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: pextrw $6, %xmm5, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je LBB6_16
-; SSE2-NEXT: ## %bb.15: ## %cond.store13
+; SSE2-NEXT: LBB6_15: ## %cond.store13
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE2-NEXT: movss %xmm1, 28(%rdi)
-; SSE2-NEXT: LBB6_16: ## %else14
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $256, %eax ## imm = 0x100
 ; SSE2-NEXT: je LBB6_18
-; SSE2-NEXT: ## %bb.17: ## %cond.store15
+; SSE2-NEXT: LBB6_17: ## %cond.store15
 ; SSE2-NEXT: movss %xmm2, 32(%rdi)
-; SSE2-NEXT: LBB6_18: ## %else16
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $512, %eax ## imm = 0x200
 ; SSE2-NEXT: je LBB6_20
-; SSE2-NEXT: ## %bb.19: ## %cond.store17
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
-; SSE2-NEXT: movss %xmm1, 36(%rdi)
-; SSE2-NEXT: LBB6_20: ## %else18
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB6_19: ## %cond.store17
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
+; SSE2-NEXT: movss %xmm0, 36(%rdi)
+; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
 ; SSE2-NEXT: je LBB6_22
-; SSE2-NEXT: ## %bb.21: ## %cond.store19
+; SSE2-NEXT: LBB6_21: ## %cond.store19
 ; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; SSE2-NEXT: movss %xmm0, 40(%rdi)
-; SSE2-NEXT: LBB6_22: ## %else20
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
 ; SSE2-NEXT: je LBB6_24
-; SSE2-NEXT: ## %bb.23: ## %cond.store21
+; SSE2-NEXT: LBB6_23: ## %cond.store21
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
 ; SSE2-NEXT: movss %xmm2, 44(%rdi)
-; SSE2-NEXT: LBB6_24: ## %else22
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
 ; SSE2-NEXT: je LBB6_26
-; SSE2-NEXT: ## %bb.25: ## %cond.store23
+; SSE2-NEXT: LBB6_25: ## %cond.store23
 ; SSE2-NEXT: movss %xmm3, 48(%rdi)
-; SSE2-NEXT: LBB6_26: ## %else24
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
 ; SSE2-NEXT: je LBB6_28
-; SSE2-NEXT: ## %bb.27: ## %cond.store25
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
-; SSE2-NEXT: movss %xmm1, 52(%rdi)
-; SSE2-NEXT: LBB6_28: ## %else26
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB6_27: ## %cond.store25
+; SSE2-NEXT: movaps %xmm3, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[2,3]
+; SSE2-NEXT: movss %xmm0, 52(%rdi)
+; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
 ; SSE2-NEXT: je LBB6_30
-; SSE2-NEXT: ## %bb.29: ## %cond.store27
+; SSE2-NEXT: LBB6_29: ## %cond.store27
 ; SSE2-NEXT: movaps %xmm3, %xmm0
 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
 ; SSE2-NEXT: movss %xmm0, 56(%rdi)
-; SSE2-NEXT: LBB6_30: ## %else28
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
 ; SSE2-NEXT: je LBB6_32
-; SSE2-NEXT: ## %bb.31: ## %cond.store29
+; SSE2-NEXT: LBB6_31: ## %cond.store29
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE2-NEXT: movss %xmm3, 60(%rdi)
-; SSE2-NEXT: LBB6_32: ## %else30
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v16f32_v16i32:
 ; SSE4: ## %bb.0:
 ; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4
-; SSE4-NEXT: pxor %xmm5, %xmm5
-; SSE4-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE4-NEXT: pextrb $0, %xmm5, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
-; SSE4-NEXT: movd %xmm0, (%rdi)
-; SSE4-NEXT: LBB6_2: ## %else
-; SSE4-NEXT: pextrb $4, %xmm5, %eax
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm4
+; SSE4-NEXT: packsswb %xmm5, %xmm4
+; SSE4-NEXT: pmovmskb %xmm4, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
-; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi)
+; SSE4-NEXT: jne LBB6_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB6_3
 ; SSE4-NEXT: LBB6_4: ## %else2
-; SSE4-NEXT: pxor %xmm5, %xmm5
-; SSE4-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE4-NEXT: pextrb $8, %xmm5, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
-; SSE4-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne LBB6_5
 ; SSE4-NEXT: LBB6_6: ## %else4
-; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4
-; SSE4-NEXT: pextrb $12, %xmm5, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
-; SSE4-NEXT: pextrd $3, %xmm0, 12(%rdi)
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne LBB6_7
 ; SSE4-NEXT: LBB6_8: ## %else6
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_10
-; SSE4-NEXT: ## %bb.9: ## %cond.store7
-; SSE4-NEXT: movd %xmm1, 16(%rdi)
+; SSE4-NEXT: testb $16, %al
+; SSE4-NEXT: jne LBB6_9
 ; SSE4-NEXT: LBB6_10: ## %else8
-; SSE4-NEXT: pextrb $4, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_12
-; SSE4-NEXT: ## %bb.11: ## %cond.store9
-; SSE4-NEXT: pextrd $1, %xmm1, 20(%rdi)
+; SSE4-NEXT: testb $32, %al
+; SSE4-NEXT: jne LBB6_11
 ; SSE4-NEXT: LBB6_12: ## %else10
-; SSE4-NEXT: pxor %xmm5, %xmm5
-; SSE4-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE4-NEXT: pextrb $8, %xmm5, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_14
-; SSE4-NEXT: ## %bb.13: ## %cond.store11
-; SSE4-NEXT: pextrd $2, %xmm1, 24(%rdi)
+; SSE4-NEXT: testb $64, %al
+; SSE4-NEXT: jne LBB6_13
 ; SSE4-NEXT: LBB6_14: ## %else12
-; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE4-NEXT: pextrb $12, %xmm5, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB6_16
-; SSE4-NEXT: ## %bb.15: ## %cond.store13
-; SSE4-NEXT: pextrd $3, %xmm1, 28(%rdi)
+; SSE4-NEXT: testb $-128, %al
+; SSE4-NEXT: jne LBB6_15
 ; SSE4-NEXT: LBB6_16: ## %else14
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $256, %eax ## imm = 0x100
+; SSE4-NEXT: jne LBB6_17
+; SSE4-NEXT: LBB6_18: ## %else16
+; SSE4-NEXT: testl $512, %eax ## imm = 0x200
+; SSE4-NEXT: jne LBB6_19
+; SSE4-NEXT: LBB6_20: ## %else18
+; SSE4-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE4-NEXT: jne LBB6_21
+; SSE4-NEXT: LBB6_22: ## %else20
+; SSE4-NEXT: testl $2048, %eax ## imm = 0x800
+; SSE4-NEXT: jne LBB6_23
+; SSE4-NEXT: LBB6_24: ## %else22
+; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE4-NEXT: jne LBB6_25
+; SSE4-NEXT: LBB6_26: ## %else24
+; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000
+; SSE4-NEXT: jne LBB6_27
+; SSE4-NEXT: LBB6_28: ## %else26
+; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE4-NEXT: jne LBB6_29
+; SSE4-NEXT: LBB6_30: ## %else28
+; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE4-NEXT: jne LBB6_31
+; SSE4-NEXT: LBB6_32: ## %else30
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB6_1: ## %cond.store
+; SSE4-NEXT: movss %xmm0, (%rdi)
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: je LBB6_4
+; SSE4-NEXT: LBB6_3: ## %cond.store1
+; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi)
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: je LBB6_6
+; SSE4-NEXT: LBB6_5: ## %cond.store3
+; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: je LBB6_8
+; SSE4-NEXT: LBB6_7: ## %cond.store5
+; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi)
+; SSE4-NEXT: testb $16, %al
+; SSE4-NEXT: je LBB6_10
+; SSE4-NEXT: LBB6_9: ## %cond.store7
+; SSE4-NEXT: movss %xmm1, 16(%rdi)
+; SSE4-NEXT: testb $32, %al
+; SSE4-NEXT: je LBB6_12
+; SSE4-NEXT: LBB6_11: ## %cond.store9
+; SSE4-NEXT: extractps $1, %xmm1, 20(%rdi)
+; SSE4-NEXT: testb $64, %al
+; SSE4-NEXT: je LBB6_14
+; SSE4-NEXT: LBB6_13: ## %cond.store11
+; SSE4-NEXT: extractps $2, %xmm1, 24(%rdi)
+; SSE4-NEXT: testb $-128, %al
+; SSE4-NEXT: je LBB6_16
+; SSE4-NEXT: LBB6_15: ## %cond.store13
+; SSE4-NEXT: extractps $3, %xmm1, 28(%rdi)
+; SSE4-NEXT: testl $256, %eax ## imm = 0x100
 ; SSE4-NEXT: je LBB6_18
-; SSE4-NEXT: ## %bb.17: ## %cond.store15
+; SSE4-NEXT: LBB6_17: ## %cond.store15
 ; SSE4-NEXT: movss %xmm2, 32(%rdi)
-; SSE4-NEXT: LBB6_18: ## %else16
-; SSE4-NEXT: pextrb $4, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $512, %eax ## imm = 0x200
 ; SSE4-NEXT: je LBB6_20
-; SSE4-NEXT: ## %bb.19: ## %cond.store17
+; SSE4-NEXT: LBB6_19: ## %cond.store17
 ; SSE4-NEXT: extractps $1, %xmm2, 36(%rdi)
-; SSE4-NEXT: LBB6_20: ## %else18
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $1024, %eax ## imm = 0x400
 ; SSE4-NEXT: je LBB6_22
-; SSE4-NEXT: ## %bb.21: ## %cond.store19
+; SSE4-NEXT: LBB6_21: ## %cond.store19
 ; SSE4-NEXT: extractps $2, %xmm2, 40(%rdi)
-; SSE4-NEXT: LBB6_22: ## %else20
-; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $2048, %eax ## imm = 0x800
 ; SSE4-NEXT: je LBB6_24
-; SSE4-NEXT: ## %bb.23: ## %cond.store21
+; SSE4-NEXT: LBB6_23: ## %cond.store21
 ; SSE4-NEXT: extractps $3, %xmm2, 44(%rdi)
-; SSE4-NEXT: LBB6_24: ## %else22
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000
 ; SSE4-NEXT: je LBB6_26
-; SSE4-NEXT: ## %bb.25: ## %cond.store23
+; SSE4-NEXT: LBB6_25: ## %cond.store23
 ; SSE4-NEXT: movss %xmm3, 48(%rdi)
-; SSE4-NEXT: LBB6_26: ## %else24
-; SSE4-NEXT: pextrb $4, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000
 ; SSE4-NEXT: je LBB6_28
-; SSE4-NEXT: ## %bb.27: ## %cond.store25
+; SSE4-NEXT: LBB6_27: ## %cond.store25
 ; SSE4-NEXT: extractps $1, %xmm3, 52(%rdi)
-; SSE4-NEXT: LBB6_28: ## %else26
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000
 ; SSE4-NEXT: je LBB6_30
-; SSE4-NEXT: ## %bb.29: ## %cond.store27
+; SSE4-NEXT: LBB6_29: ## %cond.store27
 ; SSE4-NEXT: extractps $2, %xmm3, 56(%rdi)
-; SSE4-NEXT: LBB6_30: ## %else28
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000
 ; SSE4-NEXT: je LBB6_32
-; SSE4-NEXT: ## %bb.31: ## %cond.store29
+; SSE4-NEXT: LBB6_31: ## %cond.store29
 ; SSE4-NEXT: extractps $3, %xmm3, 60(%rdi)
-; SSE4-NEXT: LBB6_32: ## %else30
 ; SSE4-NEXT: retq
 ;
 ; AVX1OR2-LABEL: store_v16f32_v16i32:
@@ -893,46 +830,39 @@
 define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %val) {
 ; SSE2-LABEL: store_v2i64_v2i64:
 ; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm0, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB7_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB7_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB7_3
+; SSE2-NEXT: LBB7_4: ## %else2
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB7_1: ## %cond.store
 ; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: LBB7_2: ## %else
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB7_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
+; SSE2-NEXT: LBB7_3: ## %cond.store1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE2-NEXT: movq %xmm0, 8(%rdi)
-; SSE2-NEXT: LBB7_4: ## %else2
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v2i64_v2i64:
 ; SSE4: ## %bb.0:
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm2
-; SSE4-NEXT: pextrb $0, %xmm2, %eax
+; SSE4-NEXT: movmskpd %xmm0, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB7_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB7_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB7_3
+; SSE4-NEXT: LBB7_4: ## %else2
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB7_1: ## %cond.store
 ; SSE4-NEXT: movq %xmm1, (%rdi)
-; SSE4-NEXT: LBB7_2: ## %else
-; SSE4-NEXT: pextrb $8, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je LBB7_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB7_3: ## %cond.store1
 ; SSE4-NEXT: pextrq $1, %xmm1, 8(%rdi)
-; SSE4-NEXT: LBB7_4: ## %else2
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: store_v2i64_v2i64:
@@ -977,81 +907,70 @@
 define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %val) {
 ; SSE2-LABEL: store_v4i64_v4i64:
 ; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB8_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB8_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB8_3
+; SSE2-NEXT: LBB8_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB8_5
+; SSE2-NEXT: LBB8_6: ## %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB8_7
+; SSE2-NEXT: LBB8_8: ## %else6
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB8_1: ## %cond.store
 ; SSE2-NEXT: movq %xmm2, (%rdi)
-; SSE2-NEXT: LBB8_2: ## %else
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB8_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
+; SSE2-NEXT: LBB8_3: ## %cond.store1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT: movq %xmm0, 8(%rdi)
-; SSE2-NEXT: LBB8_4: ## %else2
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB8_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
+; SSE2-NEXT: LBB8_5: ## %cond.store3
 ; SSE2-NEXT: movq %xmm3, 16(%rdi)
-; SSE2-NEXT: LBB8_6: ## %else4
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB8_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
+; SSE2-NEXT: LBB8_7: ## %cond.store5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
 ; SSE2-NEXT: movq %xmm0, 24(%rdi)
-; SSE2-NEXT: LBB8_8: ## %else6
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v4i64_v4i64:
 ; SSE4: ## %bb.0:
-; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm4
-; SSE4-NEXT: pextrb $0, %xmm4, %eax
+; SSE4-NEXT: packssdw %xmm1, %xmm0
+; SSE4-NEXT: movmskps %xmm0, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB8_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB8_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB8_3
+; SSE4-NEXT: LBB8_4: ## %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne LBB8_5
+; SSE4-NEXT: LBB8_6: ## %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne LBB8_7
+; SSE4-NEXT: LBB8_8: ## %else6
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB8_1: ## %cond.store
 ; SSE4-NEXT: movq %xmm2, (%rdi)
-; SSE4-NEXT: LBB8_2: ## %else
-; SSE4-NEXT: pextrb $8, %xmm4, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je LBB8_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB8_3: ## %cond.store1
 ; SSE4-NEXT: pextrq $1, %xmm2, 8(%rdi)
-; SSE4-NEXT: LBB8_4: ## %else2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
 ; SSE4-NEXT: je LBB8_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
+; SSE4-NEXT: LBB8_5: ## %cond.store3
 ; SSE4-NEXT: movq %xmm3, 16(%rdi)
-; SSE4-NEXT: LBB8_6: ## %else4
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
 ; SSE4-NEXT: je LBB8_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
+; SSE4-NEXT: LBB8_7: ## %cond.store5
 ; SSE4-NEXT: pextrq $1, %xmm3, 24(%rdi)
-; SSE4-NEXT: LBB8_8: ## %else6
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: store_v4i64_v4i64:
@@ -1132,19 +1051,21 @@
 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
 ; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movmskpd %xmm0, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB10_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB10_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB10_3
+; SSE2-NEXT: LBB10_4: ## %else2
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB10_1: ## %cond.store
 ; SSE2-NEXT: movd %xmm1, (%rdi)
-; SSE2-NEXT: LBB10_2: ## %else
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB10_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
+; SSE2-NEXT: LBB10_3: ## %cond.store1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE2-NEXT: movd %xmm0, 4(%rdi)
-; SSE2-NEXT: LBB10_4: ## %else2
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v2i32_v2i32:
@@ -1152,18 +1073,20 @@
 ; SSE4-NEXT: pxor %xmm2, %xmm2
 ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
+; SSE4-NEXT: movmskpd %xmm0, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB10_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB10_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB10_3
+; SSE4-NEXT: LBB10_4: ## %else2
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB10_1: ## %cond.store
 ; SSE4-NEXT: movss %xmm1, (%rdi)
-; SSE4-NEXT: LBB10_2: ## %else
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je LBB10_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB10_3: ## %cond.store1
 ; SSE4-NEXT: extractps $2, %xmm1, 4(%rdi)
-; SSE4-NEXT: LBB10_4: ## %else2
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: store_v2i32_v2i32:
@@ -1215,67 +1138,71 @@
 ; SSE2: ## %bb.0:
 ; SSE2-NEXT: pxor %xmm2, %xmm2
 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movmskps %xmm2, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB11_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB11_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB11_3
+; SSE2-NEXT: LBB11_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB11_5
+; SSE2-NEXT: LBB11_6: ## %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB11_7
+; SSE2-NEXT: LBB11_8: ## %else6
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB11_1: ## %cond.store
 ; SSE2-NEXT: movd %xmm1, (%rdi)
-; SSE2-NEXT: LBB11_2: ## %else
-; SSE2-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB11_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; SSE2-NEXT: movd %xmm2, 4(%rdi)
-; SSE2-NEXT: LBB11_4: ## %else2
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB11_3: ## %cond.store1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movd %xmm0, 4(%rdi)
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB11_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, 8(%rdi)
-; SSE2-NEXT: LBB11_6: ## %else4
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB11_5: ## %cond.store3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB11_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
+; SSE2-NEXT: LBB11_7: ## %cond.store5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE2-NEXT: movd %xmm0, 12(%rdi)
-; SSE2-NEXT: LBB11_8: ## %else6
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: store_v4i32_v4i32:
 ; SSE4: ## %bb.0:
 ; SSE4-NEXT: pxor %xmm2, %xmm2
 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE4-NEXT: pextrb $0, %xmm2, %eax
+; SSE4-NEXT: movmskps %xmm2, %eax
 ; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB11_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB11_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB11_3
+; SSE4-NEXT: LBB11_4: ## %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne LBB11_5
+; SSE4-NEXT: LBB11_6: ## %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne LBB11_7
+; SSE4-NEXT: LBB11_8: ## %else6
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB11_1: ## %cond.store
 ; SSE4-NEXT: movss %xmm1, (%rdi)
-; SSE4-NEXT: LBB11_2: ## %else
-; SSE4-NEXT: pextrb $4, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je LBB11_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB11_3: ## %cond.store1
 ; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi)
-; SSE4-NEXT: LBB11_4: ## %else2
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
 ; SSE4-NEXT: je LBB11_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
+; SSE4-NEXT: LBB11_5: ## %cond.store3
 ; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi)
-; SSE4-NEXT: LBB11_6: ## %else4
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
 ; SSE4-NEXT: je LBB11_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
+; SSE4-NEXT: LBB11_7: ## %cond.store5
 ; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi)
-; SSE4-NEXT: LBB11_8: ## %else6
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: store_v4i32_v4i32:
@@ -1317,131 +1244,137 @@
 ; SSE2-LABEL: store_v8i32_v8i32:
 ; SSE2: ## %bb.0:
 ; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: packssdw %xmm0, %xmm5
-; SSE2-NEXT: movd %xmm5, %eax
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB12_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB12_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB12_3
+; SSE2-NEXT: LBB12_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB12_5
+; SSE2-NEXT: LBB12_6: ## %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB12_7
+; SSE2-NEXT: LBB12_8: ## %else6
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: jne LBB12_9
+; SSE2-NEXT: LBB12_10: ## %else8
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: jne LBB12_11
+; SSE2-NEXT: LBB12_12: ## %else10
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: jne LBB12_13
+; SSE2-NEXT: LBB12_14: ## %else12
+; SSE2-NEXT: testb $-128, %al
+; SSE2-NEXT: jne LBB12_15
+; SSE2-NEXT: LBB12_16: ## %else14
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB12_1: ## %cond.store
 ; SSE2-NEXT: movd %xmm2, (%rdi)
-; SSE2-NEXT: LBB12_2: ## %else
-; SSE2-NEXT: psrlq $16, %xmm4
-; SSE2-NEXT: movd %xmm4, %eax
-; SSE2-NEXT: shrl $16, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB12_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,3]
-; SSE2-NEXT: movd %xmm4, 4(%rdi)
-; SSE2-NEXT: LBB12_4: ## %else2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB12_3: ## %cond.store1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT: movd %xmm0, 4(%rdi)
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB12_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm4, 8(%rdi)
-; SSE2-NEXT: LBB12_6: ## %else4
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB12_5: ## %cond.store3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB12_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
+; SSE2-NEXT: LBB12_7: ## %cond.store5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
 ; SSE2-NEXT: movd %xmm0, 12(%rdi)
-; SSE2-NEXT: LBB12_8: ## %else6
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $16, %al
 ; SSE2-NEXT: je LBB12_10
-; SSE2-NEXT: ## %bb.9: ## %cond.store7
+; SSE2-NEXT: LBB12_9: ## %cond.store7
 ; SSE2-NEXT: movd %xmm3, 16(%rdi)
-; SSE2-NEXT: LBB12_10: ## %else8
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $32, %al
 ; SSE2-NEXT: je LBB12_12
-; SSE2-NEXT: ## %bb.11: ## %cond.store9
+; SSE2-NEXT: LBB12_11: ## %cond.store9
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
 ; SSE2-NEXT: movd %xmm0, 20(%rdi)
-; SSE2-NEXT: LBB12_12: ## %else10
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT:
testb $64, %al ; SSE2-NEXT: je LBB12_14 -; SSE2-NEXT: ## %bb.13: ## %cond.store11 +; SSE2-NEXT: LBB12_13: ## %cond.store11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] ; SSE2-NEXT: movd %xmm0, 24(%rdi) -; SSE2-NEXT: LBB12_14: ## %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB12_16 -; SSE2-NEXT: ## %bb.15: ## %cond.store13 +; SSE2-NEXT: LBB12_15: ## %cond.store13 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] ; SSE2-NEXT: movd %xmm0, 28(%rdi) -; SSE2-NEXT: LBB12_16: ## %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: store_v8i32_v8i32: ; SSE4: ## %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE4-NEXT: pextrb $0, %xmm4, %eax +; SSE4-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE4-NEXT: packssdw %xmm1, %xmm0 +; SSE4-NEXT: packsswb %xmm0, %xmm0 +; SSE4-NEXT: pmovmskb %xmm0, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je LBB12_2 -; SSE4-NEXT: ## %bb.1: ## %cond.store +; SSE4-NEXT: jne LBB12_1 +; SSE4-NEXT: ## %bb.2: ## %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne LBB12_3 +; SSE4-NEXT: LBB12_4: ## %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne LBB12_5 +; SSE4-NEXT: LBB12_6: ## %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne LBB12_7 +; SSE4-NEXT: LBB12_8: ## %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne LBB12_9 +; SSE4-NEXT: LBB12_10: ## %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne LBB12_11 +; SSE4-NEXT: LBB12_12: ## %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne LBB12_13 +; SSE4-NEXT: LBB12_14: ## %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne LBB12_15 +; SSE4-NEXT: LBB12_16: ## %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: LBB12_1: ## %cond.store ; SSE4-NEXT: movss %xmm2, (%rdi) -; SSE4-NEXT: LBB12_2: ## %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je LBB12_4 -; SSE4-NEXT: ## %bb.3: ## %cond.store1 +; SSE4-NEXT: LBB12_3: ## %cond.store1 ; SSE4-NEXT: extractps $1, %xmm2, 4(%rdi) -; SSE4-NEXT: LBB12_4: ## %else2 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je LBB12_6 -; SSE4-NEXT: ## %bb.5: ## %cond.store3 +; SSE4-NEXT: LBB12_5: ## %cond.store3 ; SSE4-NEXT: extractps $2, %xmm2, 8(%rdi) -; SSE4-NEXT: LBB12_6: ## %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je LBB12_8 -; SSE4-NEXT: ## %bb.7: ## %cond.store5 +; SSE4-NEXT: LBB12_7: ## %cond.store5 ; SSE4-NEXT: extractps $3, %xmm2, 12(%rdi) -; SSE4-NEXT: LBB12_8: ## %else6 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je LBB12_10 -; SSE4-NEXT: ## %bb.9: ## %cond.store7 +; SSE4-NEXT: LBB12_9: ## %cond.store7 ; SSE4-NEXT: movss %xmm3, 16(%rdi) -; SSE4-NEXT: LBB12_10: ## %else8 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je LBB12_12 -; SSE4-NEXT: ## %bb.11: ## %cond.store9 +; SSE4-NEXT: LBB12_11: ## %cond.store9 ; SSE4-NEXT: extractps $1, %xmm3, 20(%rdi) -; SSE4-NEXT: LBB12_12: ## %else10 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je LBB12_14 -; SSE4-NEXT: ## %bb.13: ## %cond.store11 +; SSE4-NEXT: 
LBB12_13: ## %cond.store11 ; SSE4-NEXT: extractps $2, %xmm3, 24(%rdi) -; SSE4-NEXT: LBB12_14: ## %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je LBB12_16 -; SSE4-NEXT: ## %bb.15: ## %cond.store13 +; SSE4-NEXT: LBB12_15: ## %cond.store13 ; SSE4-NEXT: extractps $3, %xmm3, 28(%rdi) -; SSE4-NEXT: LBB12_16: ## %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: store_v8i32_v8i32: @@ -1494,339 +1427,328 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: packsswb %xmm0, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB13_2 -; SSE2-NEXT: ## %bb.1: ## %cond.store +; SSE2-NEXT: jne LBB13_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB13_3 +; SSE2-NEXT: LBB13_4: ## %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB13_5 +; SSE2-NEXT: LBB13_6: ## %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB13_7 +; SSE2-NEXT: LBB13_8: ## %else6 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB13_9 +; SSE2-NEXT: LBB13_10: ## %else8 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB13_11 +; SSE2-NEXT: LBB13_12: ## %else10 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB13_13 +; SSE2-NEXT: LBB13_14: ## %else12 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB13_15 +; SSE2-NEXT: LBB13_16: ## %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB13_1: ## %cond.store ; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: movw %cx, (%rdi) -; SSE2-NEXT: LBB13_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB13_4 -; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm1, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: LBB13_4: ## %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB13_3: ## %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB13_6 -; SSE2-NEXT: ## %bb.5: ## %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) -; SSE2-NEXT: LBB13_6: ## %else4 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB13_5: ## %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB13_8 -; SSE2-NEXT: ## %bb.7: ## %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) -; SSE2-NEXT: LBB13_8: ## %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB13_7: ## %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB13_10 -; SSE2-NEXT: ## %bb.9: ## %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) -; SSE2-NEXT: LBB13_10: ## %else8 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB13_9: ## %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB13_12 -; SSE2-NEXT: ## %bb.11: ## %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) -; SSE2-NEXT: LBB13_12: ## %else10 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; 
SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB13_11: ## %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB13_14 -; SSE2-NEXT: ## %bb.13: ## %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) -; SSE2-NEXT: LBB13_14: ## %else12 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB13_13: ## %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB13_16 -; SSE2-NEXT: ## %bb.15: ## %cond.store13 +; SSE2-NEXT: LBB13_15: ## %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm1, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) -; SSE2-NEXT: LBB13_16: ## %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: store_v8i16_v8i16: ; SSE4: ## %bb.0: ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqw %xmm0, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: packsswb %xmm0, %xmm2 +; SSE4-NEXT: pmovmskb %xmm2, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je LBB13_2 -; SSE4-NEXT: ## %bb.1: ## %cond.store +; SSE4-NEXT: jne LBB13_1 +; SSE4-NEXT: ## %bb.2: ## %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne LBB13_3 +; SSE4-NEXT: LBB13_4: ## %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne LBB13_5 +; SSE4-NEXT: LBB13_6: ## %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne LBB13_7 +; SSE4-NEXT: LBB13_8: ## %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne LBB13_9 +; SSE4-NEXT: LBB13_10: ## %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne LBB13_11 +; SSE4-NEXT: LBB13_12: ## %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne LBB13_13 +; SSE4-NEXT: LBB13_14: ## %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne LBB13_15 +; SSE4-NEXT: LBB13_16: ## %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: LBB13_1: ## %cond.store ; SSE4-NEXT: pextrw $0, %xmm1, (%rdi) -; SSE4-NEXT: LBB13_2: ## %else -; SSE4-NEXT: pextrb $2, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je LBB13_4 -; SSE4-NEXT: ## %bb.3: ## %cond.store1 +; SSE4-NEXT: LBB13_3: ## %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm1, 2(%rdi) -; SSE4-NEXT: LBB13_4: ## %else2 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqw %xmm0, %xmm2 -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je LBB13_6 -; SSE4-NEXT: ## %bb.5: ## %cond.store3 +; SSE4-NEXT: LBB13_5: ## %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm1, 4(%rdi) -; SSE4-NEXT: LBB13_6: ## %else4 -; SSE4-NEXT: pextrb $6, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je LBB13_8 -; SSE4-NEXT: ## %bb.7: ## %cond.store5 +; SSE4-NEXT: LBB13_7: ## %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm1, 6(%rdi) -; SSE4-NEXT: LBB13_8: ## %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqw %xmm0, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je LBB13_10 -; SSE4-NEXT: ## %bb.9: ## %cond.store7 +; SSE4-NEXT: LBB13_9: ## %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm1, 8(%rdi) -; SSE4-NEXT: LBB13_10: ## %else8 -; SSE4-NEXT: pextrb $10, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je LBB13_12 -; SSE4-NEXT: ## %bb.11: ## %cond.store9 +; SSE4-NEXT: LBB13_11: ## %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm1, 10(%rdi) -; SSE4-NEXT: LBB13_12: ## %else10 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; 
SSE4-NEXT: je LBB13_14 -; SSE4-NEXT: ## %bb.13: ## %cond.store11 +; SSE4-NEXT: LBB13_13: ## %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm1, 12(%rdi) -; SSE4-NEXT: LBB13_14: ## %else12 -; SSE4-NEXT: pextrb $14, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je LBB13_16 -; SSE4-NEXT: ## %bb.15: ## %cond.store13 +; SSE4-NEXT: LBB13_15: ## %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm1, 14(%rdi) -; SSE4-NEXT: LBB13_16: ## %else14 ; SSE4-NEXT: retq ; ; AVX1OR2-LABEL: store_v8i16_v8i16: ; AVX1OR2: ## %bb.0: ; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX1OR2-NEXT: vpextrb $0, %xmm2, %eax +; AVX1OR2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax ; AVX1OR2-NEXT: testb $1, %al -; AVX1OR2-NEXT: je LBB13_2 -; AVX1OR2-NEXT: ## %bb.1: ## %cond.store +; AVX1OR2-NEXT: jne LBB13_1 +; AVX1OR2-NEXT: ## %bb.2: ## %else +; AVX1OR2-NEXT: testb $2, %al +; AVX1OR2-NEXT: jne LBB13_3 +; AVX1OR2-NEXT: LBB13_4: ## %else2 +; AVX1OR2-NEXT: testb $4, %al +; AVX1OR2-NEXT: jne LBB13_5 +; AVX1OR2-NEXT: LBB13_6: ## %else4 +; AVX1OR2-NEXT: testb $8, %al +; AVX1OR2-NEXT: jne LBB13_7 +; AVX1OR2-NEXT: LBB13_8: ## %else6 +; AVX1OR2-NEXT: testb $16, %al +; AVX1OR2-NEXT: jne LBB13_9 +; AVX1OR2-NEXT: LBB13_10: ## %else8 +; AVX1OR2-NEXT: testb $32, %al +; AVX1OR2-NEXT: jne LBB13_11 +; AVX1OR2-NEXT: LBB13_12: ## %else10 +; AVX1OR2-NEXT: testb $64, %al +; AVX1OR2-NEXT: jne LBB13_13 +; AVX1OR2-NEXT: LBB13_14: ## %else12 +; AVX1OR2-NEXT: testb $-128, %al +; AVX1OR2-NEXT: jne LBB13_15 +; AVX1OR2-NEXT: LBB13_16: ## %else14 +; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: LBB13_1: ## %cond.store ; AVX1OR2-NEXT: vpextrw $0, %xmm1, (%rdi) -; AVX1OR2-NEXT: LBB13_2: ## %else -; AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $2, %al ; AVX1OR2-NEXT: je LBB13_4 -; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1 +; AVX1OR2-NEXT: LBB13_3: ## %cond.store1 ; AVX1OR2-NEXT: vpextrw $1, %xmm1, 2(%rdi) -; AVX1OR2-NEXT: LBB13_4: ## %else2 -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $4, %al ; AVX1OR2-NEXT: je LBB13_6 -; AVX1OR2-NEXT: ## %bb.5: ## %cond.store3 +; AVX1OR2-NEXT: LBB13_5: ## %cond.store3 ; AVX1OR2-NEXT: vpextrw $2, %xmm1, 4(%rdi) -; AVX1OR2-NEXT: LBB13_6: ## %else4 -; AVX1OR2-NEXT: vpextrb $6, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $8, %al ; AVX1OR2-NEXT: je LBB13_8 -; AVX1OR2-NEXT: ## %bb.7: ## %cond.store5 +; AVX1OR2-NEXT: LBB13_7: ## %cond.store5 ; AVX1OR2-NEXT: vpextrw $3, %xmm1, 6(%rdi) -; AVX1OR2-NEXT: LBB13_8: ## %else6 -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX1OR2-NEXT: vpextrb $8, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $16, %al ; AVX1OR2-NEXT: je LBB13_10 -; AVX1OR2-NEXT: ## %bb.9: ## %cond.store7 +; AVX1OR2-NEXT: LBB13_9: ## %cond.store7 ; AVX1OR2-NEXT: vpextrw $4, %xmm1, 8(%rdi) -; AVX1OR2-NEXT: LBB13_10: ## %else8 -; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $32, %al ; AVX1OR2-NEXT: je LBB13_12 -; AVX1OR2-NEXT: ## %bb.11: ## %cond.store9 +; AVX1OR2-NEXT: LBB13_11: ## %cond.store9 ; AVX1OR2-NEXT: vpextrw $5, %xmm1, 10(%rdi) -; AVX1OR2-NEXT: LBB13_12: ## %else10 -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; 
AVX1OR2-NEXT: vpextrb $12, %xmm0, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $64, %al ; AVX1OR2-NEXT: je LBB13_14 -; AVX1OR2-NEXT: ## %bb.13: ## %cond.store11 +; AVX1OR2-NEXT: LBB13_13: ## %cond.store11 ; AVX1OR2-NEXT: vpextrw $6, %xmm1, 12(%rdi) -; AVX1OR2-NEXT: LBB13_14: ## %else12 -; AVX1OR2-NEXT: vpextrb $14, %xmm0, %eax -; AVX1OR2-NEXT: testb $1, %al +; AVX1OR2-NEXT: testb $-128, %al ; AVX1OR2-NEXT: je LBB13_16 -; AVX1OR2-NEXT: ## %bb.15: ## %cond.store13 +; AVX1OR2-NEXT: LBB13_15: ## %cond.store13 ; AVX1OR2-NEXT: vpextrw $7, %xmm1, 14(%rdi) -; AVX1OR2-NEXT: LBB13_16: ## %else14 ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: store_v8i16_v8i16: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 +; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB13_2 -; AVX512F-NEXT: ## %bb.1: ## %cond.store +; AVX512F-NEXT: jne LBB13_1 +; AVX512F-NEXT: ## %bb.2: ## %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne LBB13_3 +; AVX512F-NEXT: LBB13_4: ## %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne LBB13_5 +; AVX512F-NEXT: LBB13_6: ## %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne LBB13_7 +; AVX512F-NEXT: LBB13_8: ## %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne LBB13_9 +; AVX512F-NEXT: LBB13_10: ## %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne LBB13_11 +; AVX512F-NEXT: LBB13_12: ## %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne LBB13_13 +; AVX512F-NEXT: LBB13_14: ## %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne LBB13_15 +; AVX512F-NEXT: LBB13_16: ## %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: LBB13_1: ## %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm1, (%rdi) -; AVX512F-NEXT: LBB13_2: ## %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je LBB13_4 -; AVX512F-NEXT: ## %bb.3: ## %cond.store1 +; AVX512F-NEXT: LBB13_3: ## %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi) -; AVX512F-NEXT: LBB13_4: ## %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je LBB13_6 -; AVX512F-NEXT: ## %bb.5: ## %cond.store3 +; AVX512F-NEXT: LBB13_5: ## %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi) -; AVX512F-NEXT: LBB13_6: ## %else4 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je LBB13_8 -; AVX512F-NEXT: ## %bb.7: ## %cond.store5 +; AVX512F-NEXT: LBB13_7: ## %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi) -; AVX512F-NEXT: LBB13_8: ## %else6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je LBB13_10 -; AVX512F-NEXT: ## %bb.9: 
## %cond.store7 +; AVX512F-NEXT: LBB13_9: ## %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi) -; AVX512F-NEXT: LBB13_10: ## %else8 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je LBB13_12 -; AVX512F-NEXT: ## %bb.11: ## %cond.store9 +; AVX512F-NEXT: LBB13_11: ## %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi) -; AVX512F-NEXT: LBB13_12: ## %else10 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je LBB13_14 -; AVX512F-NEXT: ## %bb.13: ## %cond.store11 +; AVX512F-NEXT: LBB13_13: ## %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi) -; AVX512F-NEXT: LBB13_14: ## %else12 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je LBB13_16 -; AVX512F-NEXT: ## %bb.15: ## %cond.store13 +; AVX512F-NEXT: LBB13_15: ## %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi) -; AVX512F-NEXT: LBB13_16: ## %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: store_v8i16_v8i16: ; AVX512VLDQ: ## %bb.0: ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al -; AVX512VLDQ-NEXT: je LBB13_2 -; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store -; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi) -; AVX512VLDQ-NEXT: LBB13_2: ## %else -; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 ; AVX512VLDQ-NEXT: kmovw %k0, %eax ; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: jne LBB13_1 +; AVX512VLDQ-NEXT: ## %bb.2: ## %else +; AVX512VLDQ-NEXT: testb $2, %al +; AVX512VLDQ-NEXT: jne LBB13_3 +; AVX512VLDQ-NEXT: LBB13_4: ## %else2 +; AVX512VLDQ-NEXT: testb $4, %al +; AVX512VLDQ-NEXT: jne LBB13_5 +; AVX512VLDQ-NEXT: LBB13_6: ## %else4 +; AVX512VLDQ-NEXT: testb $8, %al +; AVX512VLDQ-NEXT: jne LBB13_7 +; AVX512VLDQ-NEXT: LBB13_8: ## %else6 +; AVX512VLDQ-NEXT: testb $16, %al +; AVX512VLDQ-NEXT: jne LBB13_9 +; AVX512VLDQ-NEXT: LBB13_10: ## %else8 +; AVX512VLDQ-NEXT: testb $32, %al +; AVX512VLDQ-NEXT: jne LBB13_11 +; AVX512VLDQ-NEXT: LBB13_12: ## %else10 +; AVX512VLDQ-NEXT: testb $64, %al +; AVX512VLDQ-NEXT: jne LBB13_13 +; AVX512VLDQ-NEXT: LBB13_14: ## %else12 +; AVX512VLDQ-NEXT: testb $-128, %al +; AVX512VLDQ-NEXT: jne LBB13_15 +; AVX512VLDQ-NEXT: LBB13_16: ## %else14 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; AVX512VLDQ-NEXT: LBB13_1: ## %cond.store +; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi) +; AVX512VLDQ-NEXT: testb $2, %al ; AVX512VLDQ-NEXT: je LBB13_4 -; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1 +; AVX512VLDQ-NEXT: LBB13_3: ## %cond.store1 ; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi) -; AVX512VLDQ-NEXT: LBB13_4: ## %else2 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 -; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; 
AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $4, %al ; AVX512VLDQ-NEXT: je LBB13_6 -; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3 +; AVX512VLDQ-NEXT: LBB13_5: ## %cond.store3 ; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi) -; AVX512VLDQ-NEXT: LBB13_6: ## %else4 -; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $8, %al ; AVX512VLDQ-NEXT: je LBB13_8 -; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5 +; AVX512VLDQ-NEXT: LBB13_7: ## %cond.store5 ; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi) -; AVX512VLDQ-NEXT: LBB13_8: ## %else6 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 -; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 -; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $16, %al ; AVX512VLDQ-NEXT: je LBB13_10 -; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7 +; AVX512VLDQ-NEXT: LBB13_9: ## %cond.store7 ; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi) -; AVX512VLDQ-NEXT: LBB13_10: ## %else8 -; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $32, %al ; AVX512VLDQ-NEXT: je LBB13_12 -; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9 +; AVX512VLDQ-NEXT: LBB13_11: ## %cond.store9 ; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi) -; AVX512VLDQ-NEXT: LBB13_12: ## %else10 -; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 -; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %k1, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $64, %al ; AVX512VLDQ-NEXT: je LBB13_14 -; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11 +; AVX512VLDQ-NEXT: LBB13_13: ## %cond.store11 ; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi) -; AVX512VLDQ-NEXT: LBB13_14: ## %else12 -; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 -; AVX512VLDQ-NEXT: kmovw %k0, %eax -; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: testb $-128, %al ; AVX512VLDQ-NEXT: je LBB13_16 -; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13 +; AVX512VLDQ-NEXT: LBB13_15: ## %cond.store13 ; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi) -; AVX512VLDQ-NEXT: LBB13_16: ## %else14 ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq ; @@ -1844,792 +1766,745 @@ ; SSE2-LABEL: store_v16i16_v16i16: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: pcmpeqw %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE2-NEXT: packsswb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je LBB14_2 -; SSE2-NEXT: ## %bb.1: ## %cond.store +; SSE2-NEXT: jne LBB14_1 +; SSE2-NEXT: ## %bb.2: ## %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne LBB14_3 +; SSE2-NEXT: LBB14_4: ## %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne LBB14_5 +; SSE2-NEXT: LBB14_6: ## %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB14_7 +; SSE2-NEXT: LBB14_8: ## %else6 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne LBB14_9 +; SSE2-NEXT: LBB14_10: ## %else8 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne LBB14_11 +; SSE2-NEXT: LBB14_12: ## %else10 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne LBB14_13 +; SSE2-NEXT: LBB14_14: ## %else12 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne LBB14_15 +; SSE2-NEXT: LBB14_16: 
## %else14 +; SSE2-NEXT: testl $256, %eax ## imm = 0x100 +; SSE2-NEXT: jne LBB14_17 +; SSE2-NEXT: LBB14_18: ## %else16 +; SSE2-NEXT: testl $512, %eax ## imm = 0x200 +; SSE2-NEXT: jne LBB14_19 +; SSE2-NEXT: LBB14_20: ## %else18 +; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE2-NEXT: jne LBB14_21 +; SSE2-NEXT: LBB14_22: ## %else20 +; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE2-NEXT: jne LBB14_23 +; SSE2-NEXT: LBB14_24: ## %else22 +; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE2-NEXT: jne LBB14_25 +; SSE2-NEXT: LBB14_26: ## %else24 +; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 +; SSE2-NEXT: jne LBB14_27 +; SSE2-NEXT: LBB14_28: ## %else26 +; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 +; SSE2-NEXT: jne LBB14_29 +; SSE2-NEXT: LBB14_30: ## %else28 +; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 +; SSE2-NEXT: jne LBB14_31 +; SSE2-NEXT: LBB14_32: ## %else30 +; SSE2-NEXT: retq +; SSE2-NEXT: LBB14_1: ## %cond.store ; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: movw %cx, (%rdi) -; SSE2-NEXT: LBB14_2: ## %else -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB14_4 -; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm2, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: LBB14_4: ## %else2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm4 -; SSE2-NEXT: pextrw $2, %xmm4, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_3: ## %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB14_6 -; SSE2-NEXT: ## %bb.5: ## %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) -; SSE2-NEXT: LBB14_6: ## %else4 -; SSE2-NEXT: pextrw $3, %xmm4, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_5: ## %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB14_8 -; SSE2-NEXT: ## %bb.7: ## %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) -; SSE2-NEXT: LBB14_8: ## %else6 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm4 -; SSE2-NEXT: pextrw $4, %xmm4, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_7: ## %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB14_10 -; SSE2-NEXT: ## %bb.9: ## %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) -; SSE2-NEXT: LBB14_10: ## %else8 -; SSE2-NEXT: pextrw $5, %xmm4, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_9: ## %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB14_12 -; SSE2-NEXT: ## %bb.11: ## %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) -; SSE2-NEXT: LBB14_12: ## %else10 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqw %xmm4, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_11: ## %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB14_14 -; SSE2-NEXT: ## %bb.13: ## %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) -; SSE2-NEXT: LBB14_14: ## %else12 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_13: ## %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al ; 
SSE2-NEXT: je LBB14_16 -; SSE2-NEXT: ## %bb.15: ## %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: movw %ax, 14(%rdi) -; SSE2-NEXT: LBB14_16: ## %else14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_15: ## %cond.store13 +; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 14(%rdi) +; SSE2-NEXT: testl $256, %eax ## imm = 0x100 ; SSE2-NEXT: je LBB14_18 -; SSE2-NEXT: ## %bb.17: ## %cond.store15 +; SSE2-NEXT: LBB14_17: ## %cond.store15 ; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: movw %cx, 16(%rdi) -; SSE2-NEXT: LBB14_18: ## %else16 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $512, %eax ## imm = 0x200 ; SSE2-NEXT: je LBB14_20 -; SSE2-NEXT: ## %bb.19: ## %cond.store17 -; SSE2-NEXT: pextrw $1, %xmm3, %eax -; SSE2-NEXT: movw %ax, 18(%rdi) -; SSE2-NEXT: LBB14_20: ## %else18 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_19: ## %cond.store17 +; SSE2-NEXT: pextrw $1, %xmm3, %ecx +; SSE2-NEXT: movw %cx, 18(%rdi) +; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE2-NEXT: je LBB14_22 -; SSE2-NEXT: ## %bb.21: ## %cond.store19 -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: movw %ax, 20(%rdi) -; SSE2-NEXT: LBB14_22: ## %else20 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_21: ## %cond.store19 +; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: movw %cx, 20(%rdi) +; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 ; SSE2-NEXT: je LBB14_24 -; SSE2-NEXT: ## %bb.23: ## %cond.store21 -; SSE2-NEXT: pextrw $3, %xmm3, %eax -; SSE2-NEXT: movw %ax, 22(%rdi) -; SSE2-NEXT: LBB14_24: ## %else22 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_23: ## %cond.store21 +; SSE2-NEXT: pextrw $3, %xmm3, %ecx +; SSE2-NEXT: movw %cx, 22(%rdi) +; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE2-NEXT: je LBB14_26 -; SSE2-NEXT: ## %bb.25: ## %cond.store23 -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: movw %ax, 24(%rdi) -; SSE2-NEXT: LBB14_26: ## %else24 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_25: ## %cond.store23 +; SSE2-NEXT: pextrw $4, %xmm3, %ecx +; SSE2-NEXT: movw %cx, 24(%rdi) +; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 ; SSE2-NEXT: je LBB14_28 -; SSE2-NEXT: ## %bb.27: ## %cond.store25 -; SSE2-NEXT: pextrw $5, %xmm3, %eax -; SSE2-NEXT: movw %ax, 26(%rdi) -; SSE2-NEXT: LBB14_28: ## %else26 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_27: ## %cond.store25 +; SSE2-NEXT: pextrw $5, %xmm3, %ecx +; SSE2-NEXT: movw %cx, 26(%rdi) +; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE2-NEXT: je LBB14_30 -; SSE2-NEXT: ## %bb.29: ## %cond.store27 -; SSE2-NEXT: pextrw $6, %xmm3, %eax -; SSE2-NEXT: movw %ax, 28(%rdi) -; SSE2-NEXT: LBB14_30: ## %else28 -; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: LBB14_29: ## %cond.store27 +; SSE2-NEXT: pextrw $6, %xmm3, %ecx +; SSE2-NEXT: movw %cx, 28(%rdi) +; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 ; SSE2-NEXT: je LBB14_32 -; SSE2-NEXT: ## %bb.31: ## %cond.store29 +; SSE2-NEXT: LBB14_31: ## %cond.store29 ; SSE2-NEXT: pextrw $7, %xmm3, %eax ; SSE2-NEXT: movw %ax, 30(%rdi) -; SSE2-NEXT: LBB14_32: ## %else30 ; 
SSE2-NEXT: retq ; ; SSE4-LABEL: store_v16i16_v16i16: ; SSE4: ## %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqw %xmm0, %xmm4 -; SSE4-NEXT: pextrb $0, %xmm4, %eax +; SSE4-NEXT: pcmpeqw %xmm4, %xmm1 +; SSE4-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE4-NEXT: packsswb %xmm1, %xmm0 +; SSE4-NEXT: pmovmskb %xmm0, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je LBB14_2 -; SSE4-NEXT: ## %bb.1: ## %cond.store +; SSE4-NEXT: jne LBB14_1 +; SSE4-NEXT: ## %bb.2: ## %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne LBB14_3 +; SSE4-NEXT: LBB14_4: ## %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne LBB14_5 +; SSE4-NEXT: LBB14_6: ## %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne LBB14_7 +; SSE4-NEXT: LBB14_8: ## %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne LBB14_9 +; SSE4-NEXT: LBB14_10: ## %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne LBB14_11 +; SSE4-NEXT: LBB14_12: ## %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne LBB14_13 +; SSE4-NEXT: LBB14_14: ## %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne LBB14_15 +; SSE4-NEXT: LBB14_16: ## %else14 +; SSE4-NEXT: testl $256, %eax ## imm = 0x100 +; SSE4-NEXT: jne LBB14_17 +; SSE4-NEXT: LBB14_18: ## %else16 +; SSE4-NEXT: testl $512, %eax ## imm = 0x200 +; SSE4-NEXT: jne LBB14_19 +; SSE4-NEXT: LBB14_20: ## %else18 +; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE4-NEXT: jne LBB14_21 +; SSE4-NEXT: LBB14_22: ## %else20 +; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE4-NEXT: jne LBB14_23 +; SSE4-NEXT: LBB14_24: ## %else22 +; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE4-NEXT: jne LBB14_25 +; SSE4-NEXT: LBB14_26: ## %else24 +; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 +; SSE4-NEXT: jne LBB14_27 +; SSE4-NEXT: LBB14_28: ## %else26 +; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 +; SSE4-NEXT: jne LBB14_29 +; SSE4-NEXT: LBB14_30: ## %else28 +; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 +; SSE4-NEXT: jne LBB14_31 +; SSE4-NEXT: LBB14_32: ## %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: LBB14_1: ## %cond.store ; SSE4-NEXT: pextrw $0, %xmm2, (%rdi) -; SSE4-NEXT: LBB14_2: ## %else -; SSE4-NEXT: pextrb $2, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je LBB14_4 -; SSE4-NEXT: ## %bb.3: ## %cond.store1 +; SSE4-NEXT: LBB14_3: ## %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm2, 2(%rdi) -; SSE4-NEXT: LBB14_4: ## %else2 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqw %xmm0, %xmm4 -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je LBB14_6 -; SSE4-NEXT: ## %bb.5: ## %cond.store3 +; SSE4-NEXT: LBB14_5: ## %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm2, 4(%rdi) -; SSE4-NEXT: LBB14_6: ## %else4 -; SSE4-NEXT: pextrb $6, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je LBB14_8 -; SSE4-NEXT: ## %bb.7: ## %cond.store5 +; SSE4-NEXT: LBB14_7: ## %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm2, 6(%rdi) -; SSE4-NEXT: LBB14_8: ## %else6 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqw %xmm0, %xmm4 -; SSE4-NEXT: pextrb $8, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je LBB14_10 -; SSE4-NEXT: ## %bb.9: ## %cond.store7 +; SSE4-NEXT: LBB14_9: ## %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm2, 8(%rdi) -; SSE4-NEXT: LBB14_10: ## %else8 -; SSE4-NEXT: pextrb $10, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je LBB14_12 -; SSE4-NEXT: ## %bb.11: ## %cond.store9 +; SSE4-NEXT: LBB14_11: ## %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm2, 10(%rdi) -; 
SSE4-NEXT: LBB14_12: ## %else10 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqw %xmm4, %xmm0 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je LBB14_14 -; SSE4-NEXT: ## %bb.13: ## %cond.store11 +; SSE4-NEXT: LBB14_13: ## %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm2, 12(%rdi) -; SSE4-NEXT: LBB14_14: ## %else12 -; SSE4-NEXT: pextrb $14, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je LBB14_16 -; SSE4-NEXT: ## %bb.15: ## %cond.store13 +; SSE4-NEXT: LBB14_15: ## %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm2, 14(%rdi) -; SSE4-NEXT: LBB14_16: ## %else14 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax ## imm = 0x100 ; SSE4-NEXT: je LBB14_18 -; SSE4-NEXT: ## %bb.17: ## %cond.store15 +; SSE4-NEXT: LBB14_17: ## %cond.store15 ; SSE4-NEXT: pextrw $0, %xmm3, 16(%rdi) -; SSE4-NEXT: LBB14_18: ## %else16 -; SSE4-NEXT: pextrb $2, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax ## imm = 0x200 ; SSE4-NEXT: je LBB14_20 -; SSE4-NEXT: ## %bb.19: ## %cond.store17 +; SSE4-NEXT: LBB14_19: ## %cond.store17 ; SSE4-NEXT: pextrw $1, %xmm3, 18(%rdi) -; SSE4-NEXT: LBB14_20: ## %else18 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE4-NEXT: je LBB14_22 -; SSE4-NEXT: ## %bb.21: ## %cond.store19 +; SSE4-NEXT: LBB14_21: ## %cond.store19 ; SSE4-NEXT: pextrw $2, %xmm3, 20(%rdi) -; SSE4-NEXT: LBB14_22: ## %else20 -; SSE4-NEXT: pextrb $6, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 ; SSE4-NEXT: je LBB14_24 -; SSE4-NEXT: ## %bb.23: ## %cond.store21 +; SSE4-NEXT: LBB14_23: ## %cond.store21 ; SSE4-NEXT: pextrw $3, %xmm3, 22(%rdi) -; SSE4-NEXT: LBB14_24: ## %else22 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE4-NEXT: je LBB14_26 -; SSE4-NEXT: ## %bb.25: ## %cond.store23 +; SSE4-NEXT: LBB14_25: ## %cond.store23 ; SSE4-NEXT: pextrw $4, %xmm3, 24(%rdi) -; SSE4-NEXT: LBB14_26: ## %else24 -; SSE4-NEXT: pextrb $10, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 ; SSE4-NEXT: je LBB14_28 -; SSE4-NEXT: ## %bb.27: ## %cond.store25 +; SSE4-NEXT: LBB14_27: ## %cond.store25 ; SSE4-NEXT: pextrw $5, %xmm3, 26(%rdi) -; SSE4-NEXT: LBB14_28: ## %else26 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE4-NEXT: je LBB14_30 -; SSE4-NEXT: ## %bb.29: ## %cond.store27 +; SSE4-NEXT: LBB14_29: ## %cond.store27 ; SSE4-NEXT: pextrw $6, %xmm3, 28(%rdi) -; SSE4-NEXT: LBB14_30: ## %else28 -; SSE4-NEXT: pextrb $14, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 ; SSE4-NEXT: je LBB14_32 -; SSE4-NEXT: ## %bb.31: ## %cond.store29 +; SSE4-NEXT: LBB14_31: ## %cond.store29 ; SSE4-NEXT: pextrw $7, %xmm3, 30(%rdi) -; SSE4-NEXT: LBB14_32: ## %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: store_v16i16_v16i16: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB14_2 -; AVX1-NEXT: 
## %bb.1: ## %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne LBB14_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB14_3 +; AVX1-NEXT: LBB14_4: ## %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne LBB14_5 +; AVX1-NEXT: LBB14_6: ## %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne LBB14_7 +; AVX1-NEXT: LBB14_8: ## %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne LBB14_9 +; AVX1-NEXT: LBB14_10: ## %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne LBB14_11 +; AVX1-NEXT: LBB14_12: ## %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne LBB14_13 +; AVX1-NEXT: LBB14_14: ## %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: je LBB14_16 +; AVX1-NEXT: LBB14_15: ## %cond.store13 +; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX1-NEXT: LBB14_16: ## %else14 +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: jne LBB14_17 +; AVX1-NEXT: ## %bb.18: ## %else16 +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 +; AVX1-NEXT: jne LBB14_19 +; AVX1-NEXT: LBB14_20: ## %else18 +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX1-NEXT: jne LBB14_21 +; AVX1-NEXT: LBB14_22: ## %else20 +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX1-NEXT: jne LBB14_23 +; AVX1-NEXT: LBB14_24: ## %else22 +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX1-NEXT: jne LBB14_25 +; AVX1-NEXT: LBB14_26: ## %else24 +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX1-NEXT: jne LBB14_27 +; AVX1-NEXT: LBB14_28: ## %else26 +; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX1-NEXT: jne LBB14_29 +; AVX1-NEXT: LBB14_30: ## %else28 +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX1-NEXT: jne LBB14_31 +; AVX1-NEXT: LBB14_32: ## %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: LBB14_1: ## %cond.store ; AVX1-NEXT: vpextrw $0, %xmm1, (%rdi) -; AVX1-NEXT: LBB14_2: ## %else -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je LBB14_4 -; AVX1-NEXT: ## %bb.3: ## %cond.store1 +; AVX1-NEXT: LBB14_3: ## %cond.store1 ; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi) -; AVX1-NEXT: LBB14_4: ## %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB14_6 -; AVX1-NEXT: ## %bb.5: ## %cond.store3 +; AVX1-NEXT: LBB14_5: ## %cond.store3 ; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi) -; AVX1-NEXT: LBB14_6: ## %else4 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB14_8 -; AVX1-NEXT: ## %bb.7: ## %cond.store5 +; AVX1-NEXT: LBB14_7: ## %cond.store5 ; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi) -; AVX1-NEXT: LBB14_8: ## %else6 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB14_10 -; AVX1-NEXT: ## %bb.9: ## %cond.store7 +; AVX1-NEXT: LBB14_9: ## %cond.store7 ; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi) -; AVX1-NEXT: LBB14_10: ## %else8 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, 
%eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je LBB14_12 -; AVX1-NEXT: ## %bb.11: ## %cond.store9 +; AVX1-NEXT: LBB14_11: ## %cond.store9 ; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi) -; AVX1-NEXT: LBB14_12: ## %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB14_14 -; AVX1-NEXT: ## %bb.13: ## %cond.store11 +; AVX1-NEXT: LBB14_13: ## %cond.store11 ; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi) -; AVX1-NEXT: LBB14_14: ## %else12 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB14_16 -; AVX1-NEXT: ## %bb.15: ## %cond.store13 -; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi) -; AVX1-NEXT: LBB14_16: ## %else14 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: je LBB14_18 -; AVX1-NEXT: ## %bb.17: ## %cond.store15 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne LBB14_15 +; AVX1-NEXT: jmp LBB14_16 +; AVX1-NEXT: LBB14_17: ## %cond.store15 ; AVX1-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX1-NEXT: LBB14_18: ## %else16 -; AVX1-NEXT: vpextrb $2, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 ; AVX1-NEXT: je LBB14_20 -; AVX1-NEXT: ## %bb.19: ## %cond.store17 +; AVX1-NEXT: LBB14_19: ## %cond.store17 ; AVX1-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX1-NEXT: LBB14_20: ## %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX1-NEXT: je LBB14_22 -; AVX1-NEXT: ## %bb.21: ## %cond.store19 +; AVX1-NEXT: LBB14_21: ## %cond.store19 ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX1-NEXT: LBB14_22: ## %else20 -; AVX1-NEXT: vpextrb $6, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX1-NEXT: je LBB14_24 -; AVX1-NEXT: ## %bb.23: ## %cond.store21 +; AVX1-NEXT: LBB14_23: ## %cond.store21 ; AVX1-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX1-NEXT: LBB14_24: ## %else22 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: je LBB14_26 -; AVX1-NEXT: ## %bb.25: ## %cond.store23 +; AVX1-NEXT: LBB14_25: ## %cond.store23 ; AVX1-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX1-NEXT: LBB14_26: ## %else24 -; AVX1-NEXT: vpextrb $10, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX1-NEXT: je LBB14_28 -; AVX1-NEXT: ## %bb.27: ## %cond.store25 +; AVX1-NEXT: LBB14_27: ## %cond.store25 ; AVX1-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX1-NEXT: LBB14_28: ## %else26 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1-NEXT: je LBB14_30 -; AVX1-NEXT: ## %bb.29: ## %cond.store27 +; AVX1-NEXT: LBB14_29: ## %cond.store27 ; AVX1-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX1-NEXT: LBB14_30: ## %else28 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX1-NEXT: je LBB14_32 -; 
AVX1-NEXT: ## %bb.31: ## %cond.store29 +; AVX1-NEXT: LBB14_31: ## %cond.store29 ; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX1-NEXT: LBB14_32: ## %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_v16i16_v16i16: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $0, %xmm3, %eax +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB14_2 -; AVX2-NEXT: ## %bb.1: ## %cond.store +; AVX2-NEXT: jne LBB14_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB14_3 +; AVX2-NEXT: LBB14_4: ## %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne LBB14_5 +; AVX2-NEXT: LBB14_6: ## %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne LBB14_7 +; AVX2-NEXT: LBB14_8: ## %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne LBB14_9 +; AVX2-NEXT: LBB14_10: ## %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne LBB14_11 +; AVX2-NEXT: LBB14_12: ## %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne LBB14_13 +; AVX2-NEXT: LBB14_14: ## %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: je LBB14_16 +; AVX2-NEXT: LBB14_15: ## %cond.store13 +; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX2-NEXT: LBB14_16: ## %else14 +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: jne LBB14_17 +; AVX2-NEXT: ## %bb.18: ## %else16 +; AVX2-NEXT: testl $512, %eax ## imm = 0x200 +; AVX2-NEXT: jne LBB14_19 +; AVX2-NEXT: LBB14_20: ## %else18 +; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX2-NEXT: jne LBB14_21 +; AVX2-NEXT: LBB14_22: ## %else20 +; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX2-NEXT: jne LBB14_23 +; AVX2-NEXT: LBB14_24: ## %else22 +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX2-NEXT: jne LBB14_25 +; AVX2-NEXT: LBB14_26: ## %else24 +; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX2-NEXT: jne LBB14_27 +; AVX2-NEXT: LBB14_28: ## %else26 +; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX2-NEXT: jne LBB14_29 +; AVX2-NEXT: LBB14_30: ## %else28 +; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX2-NEXT: jne LBB14_31 +; AVX2-NEXT: LBB14_32: ## %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: LBB14_1: ## %cond.store ; AVX2-NEXT: vpextrw $0, %xmm1, (%rdi) -; AVX2-NEXT: LBB14_2: ## %else -; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je LBB14_4 -; AVX2-NEXT: ## %bb.3: ## %cond.store1 +; AVX2-NEXT: LBB14_3: ## %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi) -; AVX2-NEXT: LBB14_4: ## %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $4, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB14_6 -; AVX2-NEXT: ## %bb.5: ## %cond.store3 +; AVX2-NEXT: LBB14_5: ## %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi) -; AVX2-NEXT: LBB14_6: ## %else4 -; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB14_8 -; AVX2-NEXT: ## %bb.7: ## %cond.store5 +; AVX2-NEXT: LBB14_7: ## %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi) -; AVX2-NEXT: LBB14_8: ## %else6 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb 
$8, %xmm3, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB14_10
-; AVX2-NEXT: ## %bb.9: ## %cond.store7
+; AVX2-NEXT: LBB14_9: ## %cond.store7
; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi)
-; AVX2-NEXT: LBB14_10: ## %else8
-; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm2, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je LBB14_12
-; AVX2-NEXT: ## %bb.11: ## %cond.store9
+; AVX2-NEXT: LBB14_11: ## %cond.store9
; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi)
-; AVX2-NEXT: LBB14_12: ## %else10
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vpextrb $12, %xmm3, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB14_14
-; AVX2-NEXT: ## %bb.13: ## %cond.store11
+; AVX2-NEXT: LBB14_13: ## %cond.store11
; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi)
-; AVX2-NEXT: LBB14_14: ## %else12
-; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm2, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je LBB14_16
-; AVX2-NEXT: ## %bb.15: ## %cond.store13
-; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi)
-; AVX2-NEXT: LBB14_16: ## %else14
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $0, %xmm3, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: je LBB14_18
-; AVX2-NEXT: ## %bb.17: ## %cond.store15
+; AVX2-NEXT: testb $-128, %al
+; AVX2-NEXT: jne LBB14_15
+; AVX2-NEXT: jmp LBB14_16
+; AVX2-NEXT: LBB14_17: ## %cond.store15
; AVX2-NEXT: vpextrw $0, %xmm0, 16(%rdi)
-; AVX2-NEXT: LBB14_18: ## %else16
-; AVX2-NEXT: vpextrb $2, %xmm3, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: je LBB14_20
-; AVX2-NEXT: ## %bb.19: ## %cond.store17
+; AVX2-NEXT: LBB14_19: ## %cond.store17
; AVX2-NEXT: vpextrw $1, %xmm0, 18(%rdi)
-; AVX2-NEXT: LBB14_20: ## %else18
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: je LBB14_22
-; AVX2-NEXT: ## %bb.21: ## %cond.store19
+; AVX2-NEXT: LBB14_21: ## %cond.store19
; AVX2-NEXT: vpextrw $2, %xmm0, 20(%rdi)
-; AVX2-NEXT: LBB14_22: ## %else20
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: je LBB14_24
-; AVX2-NEXT: ## %bb.23: ## %cond.store21
+; AVX2-NEXT: LBB14_23: ## %cond.store21
; AVX2-NEXT: vpextrw $3, %xmm0, 22(%rdi)
-; AVX2-NEXT: LBB14_24: ## %else22
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB14_26
-; AVX2-NEXT: ## %bb.25: ## %cond.store23
+; AVX2-NEXT: LBB14_25: ## %cond.store23
; AVX2-NEXT: vpextrw $4, %xmm0, 24(%rdi)
-; AVX2-NEXT: LBB14_26: ## %else24
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: je LBB14_28
-; AVX2-NEXT: ## %bb.27: ## %cond.store25
+; AVX2-NEXT: LBB14_27: ## %cond.store25
; AVX2-NEXT: vpextrw $5, %xmm0, 26(%rdi)
-; AVX2-NEXT: LBB14_28: ## %else26
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB14_30
-; AVX2-NEXT: ## %bb.29: ## %cond.store27
+; AVX2-NEXT: LBB14_29: ## %cond.store27
; AVX2-NEXT: vpextrw $6, %xmm0, 28(%rdi)
-; AVX2-NEXT: LBB14_30: ## %else28
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB14_32
-; AVX2-NEXT: ## %bb.31: ## %cond.store29
+; AVX2-NEXT: LBB14_31: ## %cond.store29
; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi)
-; AVX2-NEXT: LBB14_32: ## %else30
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_v16i16_v16i16:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je LBB14_2
-; AVX512F-NEXT: ## %bb.1: ## %cond.store
+; AVX512F-NEXT: jne LBB14_1
+; AVX512F-NEXT: ## %bb.2: ## %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne LBB14_3
+; AVX512F-NEXT: LBB14_4: ## %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: jne LBB14_5
+; AVX512F-NEXT: LBB14_6: ## %else4
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne LBB14_7
+; AVX512F-NEXT: LBB14_8: ## %else6
+; AVX512F-NEXT: testb $16, %al
+; AVX512F-NEXT: jne LBB14_9
+; AVX512F-NEXT: LBB14_10: ## %else8
+; AVX512F-NEXT: testb $32, %al
+; AVX512F-NEXT: jne LBB14_11
+; AVX512F-NEXT: LBB14_12: ## %else10
+; AVX512F-NEXT: testb $64, %al
+; AVX512F-NEXT: jne LBB14_13
+; AVX512F-NEXT: LBB14_14: ## %else12
+; AVX512F-NEXT: testb $-128, %al
+; AVX512F-NEXT: je LBB14_16
+; AVX512F-NEXT: LBB14_15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB14_16: ## %else14
+; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: jne LBB14_17
+; AVX512F-NEXT: ## %bb.18: ## %else16
+; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512F-NEXT: jne LBB14_19
+; AVX512F-NEXT: LBB14_20: ## %else18
+; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512F-NEXT: jne LBB14_21
+; AVX512F-NEXT: LBB14_22: ## %else20
+; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512F-NEXT: jne LBB14_23
+; AVX512F-NEXT: LBB14_24: ## %else22
+; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512F-NEXT: jne LBB14_25
+; AVX512F-NEXT: LBB14_26: ## %else24
+; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512F-NEXT: jne LBB14_27
+; AVX512F-NEXT: LBB14_28: ## %else26
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512F-NEXT: jne LBB14_29
+; AVX512F-NEXT: LBB14_30: ## %else28
+; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512F-NEXT: jne LBB14_31
+; AVX512F-NEXT: LBB14_32: ## %else30
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: LBB14_1: ## %cond.store
; AVX512F-NEXT: vpextrw $0, %xmm1, (%rdi)
-; AVX512F-NEXT: LBB14_2: ## %else
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je LBB14_4
-; AVX512F-NEXT: ## %bb.3: ## %cond.store1
+; AVX512F-NEXT: LBB14_3: ## %cond.store1
; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
-; AVX512F-NEXT: LBB14_4: ## %else2
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je LBB14_6
-; AVX512F-NEXT: ## %bb.5: ## %cond.store3
+; AVX512F-NEXT: LBB14_5: ## %cond.store3
; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
-; AVX512F-NEXT: LBB14_6: ## %else4
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB14_8
-; AVX512F-NEXT: ## %bb.7: ## %cond.store5
+; AVX512F-NEXT: LBB14_7: ## %cond.store5
; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
-; AVX512F-NEXT: LBB14_8: ## %else6
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB14_10
-; AVX512F-NEXT: ## %bb.9: ## %cond.store7
+; AVX512F-NEXT: LBB14_9: ## %cond.store7
; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
-; AVX512F-NEXT: LBB14_10: ## %else8
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB14_12
-; AVX512F-NEXT: ## %bb.11: ## %cond.store9
+; AVX512F-NEXT: LBB14_11: ## %cond.store9
; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
-; AVX512F-NEXT: LBB14_12: ## %else10
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB14_14
-; AVX512F-NEXT: ## %bb.13: ## %cond.store11
+; AVX512F-NEXT: LBB14_13: ## %cond.store11
; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
-; AVX512F-NEXT: LBB14_14: ## %else12
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je LBB14_16
-; AVX512F-NEXT: ## %bb.15: ## %cond.store13
-; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
-; AVX512F-NEXT: LBB14_16: ## %else14
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT: je LBB14_18
-; AVX512F-NEXT: ## %bb.17: ## %cond.store15
-; AVX512F-NEXT: vpextrw $0, %xmm1, 16(%rdi)
-; AVX512F-NEXT: LBB14_18: ## %else16
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $-128, %al
+; AVX512F-NEXT: jne LBB14_15
+; AVX512F-NEXT: jmp LBB14_16
+; AVX512F-NEXT: LBB14_17: ## %cond.store15
+; AVX512F-NEXT: vpextrw $0, %xmm0, 16(%rdi)
+; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB14_20
-; AVX512F-NEXT: ## %bb.19: ## %cond.store17
-; AVX512F-NEXT: vpextrw $1, %xmm1, 18(%rdi)
-; AVX512F-NEXT: LBB14_20: ## %else18
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $10, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: LBB14_19: ## %cond.store17
+; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi)
+; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB14_22
-; AVX512F-NEXT: ## %bb.21: ## %cond.store19
-; AVX512F-NEXT: vpextrw $2, %xmm1, 20(%rdi)
-; AVX512F-NEXT: LBB14_22: ## %else20
-; AVX512F-NEXT: kshiftrw $11, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: LBB14_21: ## %cond.store19
+; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi)
+; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB14_24
-; AVX512F-NEXT: ## %bb.23: ## %cond.store21
-; AVX512F-NEXT: vpextrw $3, %xmm1, 22(%rdi)
-; AVX512F-NEXT: LBB14_24: ## %else22
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: LBB14_23: ## %cond.store21
+; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi)
+; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB14_26
-; AVX512F-NEXT: ## %bb.25: ## %cond.store23
-; AVX512F-NEXT: vpextrw $4, %xmm1, 24(%rdi)
-; AVX512F-NEXT: LBB14_26: ## %else24
-; AVX512F-NEXT: kshiftrw $13, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: LBB14_25: ## %cond.store23
+; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi)
+; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB14_28
-; AVX512F-NEXT: ## %bb.27: ## %cond.store25
-; AVX512F-NEXT: vpextrw $5, %xmm1, 26(%rdi)
-; AVX512F-NEXT: LBB14_28: ## %else26
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: LBB14_27: ## %cond.store25
+; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi)
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB14_30
-; AVX512F-NEXT: ## %bb.29: ## %cond.store27
-; AVX512F-NEXT: vpextrw $6, %xmm1, 28(%rdi)
-; AVX512F-NEXT: LBB14_30: ## %else28
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: LBB14_29: ## %cond.store27
+; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi)
+; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: je LBB14_32
-; AVX512F-NEXT: ## %bb.31: ## %cond.store29
-; AVX512F-NEXT: vpextrw $7, %xmm1, 30(%rdi)
-; AVX512F-NEXT: LBB14_32: ## %else30
+; AVX512F-NEXT: LBB14_31: ## %cond.store29
+; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v16i16_v16i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB14_2
-; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT: jne LBB14_1
+; AVX512VLDQ-NEXT: ## %bb.2: ## %else
+; AVX512VLDQ-NEXT: testb $2, %al
+; AVX512VLDQ-NEXT: jne LBB14_3
+; AVX512VLDQ-NEXT: LBB14_4: ## %else2
+; AVX512VLDQ-NEXT: testb $4, %al
+; AVX512VLDQ-NEXT: jne LBB14_5
+; AVX512VLDQ-NEXT: LBB14_6: ## %else4
+; AVX512VLDQ-NEXT: testb $8, %al
+; AVX512VLDQ-NEXT: jne LBB14_7
+; AVX512VLDQ-NEXT: LBB14_8: ## %else6
+; AVX512VLDQ-NEXT: testb $16, %al
+; AVX512VLDQ-NEXT: jne LBB14_9
+; AVX512VLDQ-NEXT: LBB14_10: ## %else8
+; AVX512VLDQ-NEXT: testb $32, %al
+; AVX512VLDQ-NEXT: jne LBB14_11
+; AVX512VLDQ-NEXT: LBB14_12: ## %else10
+; AVX512VLDQ-NEXT: testb $64, %al
+; AVX512VLDQ-NEXT: jne LBB14_13
+; AVX512VLDQ-NEXT: LBB14_14: ## %else12
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: je LBB14_16
+; AVX512VLDQ-NEXT: LBB14_15: ## %cond.store13
+; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512VLDQ-NEXT: LBB14_16: ## %else14
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: jne LBB14_17
+; AVX512VLDQ-NEXT: ## %bb.18: ## %else16
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512VLDQ-NEXT: jne LBB14_19
+; AVX512VLDQ-NEXT: LBB14_20: ## %else18
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512VLDQ-NEXT: jne LBB14_21
+; AVX512VLDQ-NEXT: LBB14_22: ## %else20
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512VLDQ-NEXT: jne LBB14_23
+; AVX512VLDQ-NEXT: LBB14_24: ## %else22
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512VLDQ-NEXT: jne LBB14_25
+; AVX512VLDQ-NEXT: LBB14_26: ## %else24
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512VLDQ-NEXT: jne LBB14_27
+; AVX512VLDQ-NEXT: LBB14_28: ## %else26
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512VLDQ-NEXT: jne LBB14_29
+; AVX512VLDQ-NEXT: LBB14_30: ## %else28
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: jne LBB14_31
+; AVX512VLDQ-NEXT: LBB14_32: ## %else30
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+; AVX512VLDQ-NEXT: LBB14_1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi)
-; AVX512VLDQ-NEXT: LBB14_2: ## %else
-; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB14_4
-; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT: LBB14_3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi)
-; AVX512VLDQ-NEXT: LBB14_4: ## %else2
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB14_6
-; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3
+; AVX512VLDQ-NEXT: LBB14_5: ## %cond.store3
; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi)
-; AVX512VLDQ-NEXT: LBB14_6: ## %else4
-; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB14_8
-; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5
+; AVX512VLDQ-NEXT: LBB14_7: ## %cond.store5
; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi)
-; AVX512VLDQ-NEXT: LBB14_8: ## %else6
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB14_10
-; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7
+; AVX512VLDQ-NEXT: LBB14_9: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi)
-; AVX512VLDQ-NEXT: LBB14_10: ## %else8
-; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB14_12
-; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9
+; AVX512VLDQ-NEXT: LBB14_11: ## %cond.store9
; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi)
-; AVX512VLDQ-NEXT: LBB14_12: ## %else10
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB14_14
-; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11
+; AVX512VLDQ-NEXT: LBB14_13: ## %cond.store11
; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi)
-; AVX512VLDQ-NEXT: LBB14_14: ## %else12
-; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB14_16
-; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13
-; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi)
-; AVX512VLDQ-NEXT: LBB14_16: ## %else14
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512VLDQ-NEXT: je LBB14_18
-; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15
-; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, 16(%rdi)
-; AVX512VLDQ-NEXT: LBB14_18: ## %else16
-; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: jne LBB14_15
+; AVX512VLDQ-NEXT: jmp LBB14_16
+; AVX512VLDQ-NEXT: LBB14_17: ## %cond.store15
+; AVX512VLDQ-NEXT: vpextrw $0, %xmm0, 16(%rdi)
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB14_20
-; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17
-; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 18(%rdi)
-; AVX512VLDQ-NEXT: LBB14_20: ## %else18
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: LBB14_19: ## %cond.store17
+; AVX512VLDQ-NEXT: vpextrw $1, %xmm0, 18(%rdi)
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB14_22
-; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19
-; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 20(%rdi)
-; AVX512VLDQ-NEXT: LBB14_22: ## %else20
-; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: LBB14_21: ## %cond.store19
+; AVX512VLDQ-NEXT: vpextrw $2, %xmm0, 20(%rdi)
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB14_24
-; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21
-; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 22(%rdi)
-; AVX512VLDQ-NEXT: LBB14_24: ## %else22
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: LBB14_23: ## %cond.store21
+; AVX512VLDQ-NEXT: vpextrw $3, %xmm0, 22(%rdi)
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB14_26
-; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23
-; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 24(%rdi)
-; AVX512VLDQ-NEXT: LBB14_26: ## %else24
-; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: LBB14_25: ## %cond.store23
+; AVX512VLDQ-NEXT: vpextrw $4, %xmm0, 24(%rdi)
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB14_28
-; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25
-; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 26(%rdi)
-; AVX512VLDQ-NEXT: LBB14_28: ## %else26
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
-; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: LBB14_27: ## %cond.store25
+; AVX512VLDQ-NEXT: vpextrw $5, %xmm0, 26(%rdi)
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB14_30
-; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27
-; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 28(%rdi)
-; AVX512VLDQ-NEXT: LBB14_30: ## %else28
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: LBB14_29: ## %cond.store27
+; AVX512VLDQ-NEXT: vpextrw $6, %xmm0, 28(%rdi)
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB14_32
-; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29
-; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 30(%rdi)
-; AVX512VLDQ-NEXT: LBB14_32: ## %else30
+; AVX512VLDQ-NEXT: LBB14_31: ## %cond.store29
+; AVX512VLDQ-NEXT: vpextrw $7, %xmm0, 30(%rdi)
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
@@ -2653,656 +2528,579 @@
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: je LBB15_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movb %al, (%rdi)
-; SSE2-NEXT: LBB15_2: ## %else
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: je LBB15_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: jne LBB15_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB15_3
; SSE2-NEXT: LBB15_4: ## %else2
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: movl %ecx, %edx
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: testb $1, %dl
-; SSE2-NEXT: je LBB15_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB15_5
; SSE2-NEXT: LBB15_6: ## %else4
-; SSE2-NEXT: shrl $24, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB15_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
-; SSE2-NEXT: shrl $24, %eax
-; SSE2-NEXT: movb %al, 3(%rdi)
+; SSE2-NEXT: LBB15_7: ## %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: LBB15_8: ## %else6
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $2, %xmm2, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
; SSE2-NEXT: je LBB15_10
; SSE2-NEXT: ## %bb.9: ## %cond.store7
-; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: LBB15_10: ## %else8
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB15_12
; SSE2-NEXT: ## %bb.11: ## %cond.store9
-; SSE2-NEXT: movb %ah, 5(%rdi)
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: LBB15_12: ## %else10
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $3, %xmm2, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $3, %xmm1, %eax
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm1, %ecx
; SSE2-NEXT: je LBB15_14
; SSE2-NEXT: ## %bb.13: ## %cond.store11
-; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: LBB15_14: ## %else12
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB15_16
; SSE2-NEXT: ## %bb.15: ## %cond.store13
-; SSE2-NEXT: movb %ah, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: LBB15_16: ## %else14
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $4, %xmm2, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
+; SSE2-NEXT: testl $256, %eax ## imm = 0x100
+; SSE2-NEXT: pextrw $4, %xmm1, %ecx
; SSE2-NEXT: je LBB15_18
; SSE2-NEXT: ## %bb.17: ## %cond.store15
-; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb %cl, 8(%rdi)
; SSE2-NEXT: LBB15_18: ## %else16
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $512, %eax ## imm = 0x200
; SSE2-NEXT: je LBB15_20
; SSE2-NEXT: ## %bb.19: ## %cond.store17
-; SSE2-NEXT: movb %ah, 9(%rdi)
+; SSE2-NEXT: movb %ch, 9(%rdi)
; SSE2-NEXT: LBB15_20: ## %else18
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $5, %xmm2, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $5, %xmm1, %eax
+; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: je LBB15_22
; SSE2-NEXT: ## %bb.21: ## %cond.store19
-; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb %cl, 10(%rdi)
; SSE2-NEXT: LBB15_22: ## %else20
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
; SSE2-NEXT: je LBB15_24
; SSE2-NEXT: ## %bb.23: ## %cond.store21
-; SSE2-NEXT: movb %ah, 11(%rdi)
+; SSE2-NEXT: movb %ch, 11(%rdi)
; SSE2-NEXT: LBB15_24: ## %else22
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: pextrw $6, %xmm2, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
+; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE2-NEXT: pextrw $6, %xmm1, %ecx
; SSE2-NEXT: je LBB15_26
; SSE2-NEXT: ## %bb.25: ## %cond.store23
-; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb %cl, 12(%rdi)
; SSE2-NEXT: LBB15_26: ## %else24
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT: je LBB15_28
; SSE2-NEXT: ## %bb.27: ## %cond.store25
-; SSE2-NEXT: movb %ah, 13(%rdi)
+; SSE2-NEXT: movb %ch, 13(%rdi)
; SSE2-NEXT: LBB15_28: ## %else26
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT: pextrw $7, %xmm0, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $7, %xmm1, %eax
-; SSE2-NEXT: je LBB15_30
-; SSE2-NEXT: ## %bb.29: ## %cond.store27
-; SSE2-NEXT: movb %al, 14(%rdi)
-; SSE2-NEXT: LBB15_30: ## %else28
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: je LBB15_32
-; SSE2-NEXT: ## %bb.31: ## %cond.store29
-; SSE2-NEXT: movb %ah, 15(%rdi)
+; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE2-NEXT: pextrw $7, %xmm1, %ecx
+; SSE2-NEXT: jne LBB15_29
+; SSE2-NEXT: ## %bb.30: ## %else28
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE2-NEXT: jne LBB15_31
; SSE2-NEXT: LBB15_32: ## %else30
; SSE2-NEXT: retq
+; SSE2-NEXT: LBB15_1: ## %cond.store
+; SSE2-NEXT: movb %cl, (%rdi)
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: je LBB15_4
+; SSE2-NEXT: LBB15_3: ## %cond.store1
+; SSE2-NEXT: movb %ch, 1(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: je LBB15_6
+; SSE2-NEXT: LBB15_5: ## %cond.store3
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB15_7
+; SSE2-NEXT: jmp LBB15_8
+; SSE2-NEXT: LBB15_29: ## %cond.store27
+; SSE2-NEXT: movb %cl, 14(%rdi)
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE2-NEXT: je LBB15_32
+; SSE2-NEXT: LBB15_31: ## %cond.store29
+; SSE2-NEXT: movb %ch, 15(%rdi)
+; SSE2-NEXT: retq
;
; SSE4-LABEL: store_v16i8_v16i8:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE4-NEXT: pextrb $0, %xmm2, %eax
+; SSE4-NEXT: pmovmskb %xmm2, %eax
; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB15_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB15_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB15_3
+; SSE4-NEXT: LBB15_4: ## %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne LBB15_5
+; SSE4-NEXT: LBB15_6: ## %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne LBB15_7
+; SSE4-NEXT: LBB15_8: ## %else6
+; SSE4-NEXT: testb $16, %al
+; SSE4-NEXT: jne LBB15_9
+; SSE4-NEXT: LBB15_10: ## %else8
+; SSE4-NEXT: testb $32, %al
+; SSE4-NEXT: jne LBB15_11
+; SSE4-NEXT: LBB15_12: ## %else10
+; SSE4-NEXT: testb $64, %al
+; SSE4-NEXT: jne LBB15_13
+; SSE4-NEXT: LBB15_14: ## %else12
+; SSE4-NEXT: testb $-128, %al
+; SSE4-NEXT: jne LBB15_15
+; SSE4-NEXT: LBB15_16: ## %else14
+; SSE4-NEXT: testl $256, %eax ## imm = 0x100
+; SSE4-NEXT: jne LBB15_17
+; SSE4-NEXT: LBB15_18: ## %else16
+; SSE4-NEXT: testl $512, %eax ## imm = 0x200
+; SSE4-NEXT: jne LBB15_19
+; SSE4-NEXT: LBB15_20: ## %else18
+; SSE4-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE4-NEXT: jne LBB15_21
+; SSE4-NEXT: LBB15_22: ## %else20
+; SSE4-NEXT: testl $2048, %eax ## imm = 0x800
+; SSE4-NEXT: jne LBB15_23
+; SSE4-NEXT: LBB15_24: ## %else22
+; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE4-NEXT: jne LBB15_25
+; SSE4-NEXT: LBB15_26: ## %else24
+; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000
+; SSE4-NEXT: jne LBB15_27
+; SSE4-NEXT: LBB15_28: ## %else26
+; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE4-NEXT: jne LBB15_29
+; SSE4-NEXT: LBB15_30: ## %else28
+; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000
+; SSE4-NEXT: jne LBB15_31
+; SSE4-NEXT: LBB15_32: ## %else30
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB15_1: ## %cond.store
; SSE4-NEXT: pextrb $0, %xmm1, (%rdi)
-; SSE4-NEXT: LBB15_2: ## %else
-; SSE4-NEXT: pextrb $1, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je LBB15_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB15_3: ## %cond.store1
; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi)
-; SSE4-NEXT: LBB15_4: ## %else2
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE4-NEXT: pextrb $2, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je LBB15_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
+; SSE4-NEXT: LBB15_5: ## %cond.store3
; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi)
-; SSE4-NEXT: LBB15_6: ## %else4
-; SSE4-NEXT: pextrb $3, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je LBB15_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
+; SSE4-NEXT: LBB15_7: ## %cond.store5
; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi)
-; SSE4-NEXT: LBB15_8: ## %else6
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE4-NEXT: pextrb $4, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je LBB15_10
-; SSE4-NEXT: ## %bb.9: ## %cond.store7
+; SSE4-NEXT: LBB15_9: ## %cond.store7
; SSE4-NEXT: pextrb $4, %xmm1, 4(%rdi)
-; SSE4-NEXT: LBB15_10: ## %else8
-; SSE4-NEXT: pextrb $5, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je LBB15_12
-; SSE4-NEXT: ## %bb.11: ## %cond.store9
+; SSE4-NEXT: LBB15_11: ## %cond.store9
; SSE4-NEXT: pextrb $5, %xmm1, 5(%rdi)
-; SSE4-NEXT: LBB15_12: ## %else10
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE4-NEXT: pextrb $6, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je LBB15_14
-; SSE4-NEXT: ## %bb.13: ## %cond.store11
+; SSE4-NEXT: LBB15_13: ## %cond.store11
; SSE4-NEXT: pextrb $6, %xmm1, 6(%rdi)
-; SSE4-NEXT: LBB15_14: ## %else12
-; SSE4-NEXT: pextrb $7, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je LBB15_16
-; SSE4-NEXT: ## %bb.15: ## %cond.store13
+; SSE4-NEXT: LBB15_15: ## %cond.store13
; SSE4-NEXT: pextrb $7, %xmm1, 7(%rdi)
-; SSE4-NEXT: LBB15_16: ## %else14
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE4-NEXT: pextrb $8, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $256, %eax ## imm = 0x100
; SSE4-NEXT: je LBB15_18
-; SSE4-NEXT: ## %bb.17: ## %cond.store15
+; SSE4-NEXT: LBB15_17: ## %cond.store15
; SSE4-NEXT: pextrb $8, %xmm1, 8(%rdi)
-; SSE4-NEXT: LBB15_18: ## %else16
-; SSE4-NEXT: pextrb $9, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $512, %eax ## imm = 0x200
; SSE4-NEXT: je LBB15_20
-; SSE4-NEXT: ## %bb.19: ## %cond.store17
+; SSE4-NEXT: LBB15_19: ## %cond.store17
; SSE4-NEXT: pextrb $9, %xmm1, 9(%rdi)
-; SSE4-NEXT: LBB15_20: ## %else18
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE4-NEXT: pextrb $10, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $1024, %eax ## imm = 0x400
; SSE4-NEXT: je LBB15_22
-; SSE4-NEXT: ## %bb.21: ## %cond.store19
+; SSE4-NEXT: LBB15_21: ## %cond.store19
; SSE4-NEXT: pextrb $10, %xmm1, 10(%rdi)
-; SSE4-NEXT: LBB15_22: ## %else20
-; SSE4-NEXT: pextrb $11, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $2048, %eax ## imm = 0x800
; SSE4-NEXT: je LBB15_24
-; SSE4-NEXT: ## %bb.23: ## %cond.store21
+; SSE4-NEXT: LBB15_23: ## %cond.store21
; SSE4-NEXT: pextrb $11, %xmm1, 11(%rdi)
-; SSE4-NEXT: LBB15_24: ## %else22
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE4-NEXT: pextrb $12, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000
; SSE4-NEXT: je LBB15_26
-; SSE4-NEXT: ## %bb.25: ## %cond.store23
+; SSE4-NEXT: LBB15_25: ## %cond.store23
; SSE4-NEXT: pextrb $12, %xmm1, 12(%rdi)
-; SSE4-NEXT: LBB15_26: ## %else24
-; SSE4-NEXT: pextrb $13, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE4-NEXT: je LBB15_28
-; SSE4-NEXT: ## %bb.27: ## %cond.store25
+; SSE4-NEXT: LBB15_27: ## %cond.store25
; SSE4-NEXT: pextrb $13, %xmm1, 13(%rdi)
-; SSE4-NEXT: LBB15_28: ## %else26
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE4-NEXT: pextrb $14, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000
; SSE4-NEXT: je LBB15_30
-; SSE4-NEXT: ## %bb.29: ## %cond.store27
+; SSE4-NEXT: LBB15_29: ## %cond.store27
; SSE4-NEXT: pextrb $14, %xmm1, 14(%rdi)
-; SSE4-NEXT: LBB15_30: ## %else28
-; SSE4-NEXT: pextrb $15, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE4-NEXT: je LBB15_32
-; SSE4-NEXT: ## %bb.31: ## %cond.store29
+; SSE4-NEXT: LBB15_31: ## %cond.store29
; SSE4-NEXT: pextrb $15, %xmm1, 15(%rdi)
-; SSE4-NEXT: LBB15_32: ## %else30
; SSE4-NEXT: retq
;
; AVX1OR2-LABEL: store_v16i8_v16i8:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
; AVX1OR2-NEXT: testb $1, %al
-; AVX1OR2-NEXT: je LBB15_2
-; AVX1OR2-NEXT: ## %bb.1: ## %cond.store
+; AVX1OR2-NEXT: jne LBB15_1
+; AVX1OR2-NEXT: ## %bb.2: ## %else
+; AVX1OR2-NEXT: testb $2, %al
+; AVX1OR2-NEXT: jne LBB15_3
+; AVX1OR2-NEXT: LBB15_4: ## %else2
+; AVX1OR2-NEXT: testb $4, %al
+; AVX1OR2-NEXT: jne LBB15_5
+; AVX1OR2-NEXT: LBB15_6: ## %else4
+; AVX1OR2-NEXT: testb $8, %al
+; AVX1OR2-NEXT: jne LBB15_7
+; AVX1OR2-NEXT: LBB15_8: ## %else6
+; AVX1OR2-NEXT: testb $16, %al
+; AVX1OR2-NEXT: jne LBB15_9
+; AVX1OR2-NEXT: LBB15_10: ## %else8
+; AVX1OR2-NEXT: testb $32, %al
+; AVX1OR2-NEXT: jne LBB15_11
+; AVX1OR2-NEXT: LBB15_12: ## %else10
+; AVX1OR2-NEXT: testb $64, %al
+; AVX1OR2-NEXT: jne LBB15_13
+; AVX1OR2-NEXT: LBB15_14: ## %else12
+; AVX1OR2-NEXT: testb $-128, %al
+; AVX1OR2-NEXT: jne LBB15_15
+; AVX1OR2-NEXT: LBB15_16: ## %else14
+; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
+; AVX1OR2-NEXT: jne LBB15_17
+; AVX1OR2-NEXT: LBB15_18: ## %else16
+; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
+; AVX1OR2-NEXT: jne LBB15_19
+; AVX1OR2-NEXT: LBB15_20: ## %else18
+; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX1OR2-NEXT: jne LBB15_21
+; AVX1OR2-NEXT: LBB15_22: ## %else20
+; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX1OR2-NEXT: jne LBB15_23
+; AVX1OR2-NEXT: LBB15_24: ## %else22
+; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX1OR2-NEXT: jne LBB15_25
+; AVX1OR2-NEXT: LBB15_26: ## %else24
+; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX1OR2-NEXT: jne LBB15_27
+; AVX1OR2-NEXT: LBB15_28: ## %else26
+; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX1OR2-NEXT: jne LBB15_29
+; AVX1OR2-NEXT: LBB15_30: ## %else28
+; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX1OR2-NEXT: jne LBB15_31
+; AVX1OR2-NEXT: LBB15_32: ## %else30
+; AVX1OR2-NEXT: retq
+; AVX1OR2-NEXT: LBB15_1: ## %cond.store
; AVX1OR2-NEXT: vpextrb $0, %xmm1, (%rdi)
-; AVX1OR2-NEXT: LBB15_2: ## %else
-; AVX1OR2-NEXT: vpextrb $1, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: je LBB15_4
-; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1
+; AVX1OR2-NEXT: LBB15_3: ## %cond.store1
; AVX1OR2-NEXT: vpextrb $1, %xmm1, 1(%rdi)
-; AVX1OR2-NEXT: LBB15_4: ## %else2
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $4, %al
; AVX1OR2-NEXT: je LBB15_6
-; AVX1OR2-NEXT: ## %bb.5: ## %cond.store3
+; AVX1OR2-NEXT: LBB15_5: ## %cond.store3
; AVX1OR2-NEXT: vpextrb $2, %xmm1, 2(%rdi)
-; AVX1OR2-NEXT: LBB15_6: ## %else4
-; AVX1OR2-NEXT: vpextrb $3, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $8, %al
; AVX1OR2-NEXT: je LBB15_8
-; AVX1OR2-NEXT: ## %bb.7: ## %cond.store5
+; AVX1OR2-NEXT: LBB15_7: ## %cond.store5
; AVX1OR2-NEXT: vpextrb $3, %xmm1, 3(%rdi)
-; AVX1OR2-NEXT: LBB15_8: ## %else6
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $16, %al
; AVX1OR2-NEXT: je LBB15_10
-; AVX1OR2-NEXT: ## %bb.9: ## %cond.store7
+; AVX1OR2-NEXT: LBB15_9: ## %cond.store7
; AVX1OR2-NEXT: vpextrb $4, %xmm1, 4(%rdi)
-; AVX1OR2-NEXT: LBB15_10: ## %else8
-; AVX1OR2-NEXT: vpextrb $5, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $32, %al
; AVX1OR2-NEXT: je LBB15_12
-; AVX1OR2-NEXT: ## %bb.11: ## %cond.store9
+; AVX1OR2-NEXT: LBB15_11: ## %cond.store9
; AVX1OR2-NEXT: vpextrb $5, %xmm1, 5(%rdi)
-; AVX1OR2-NEXT: LBB15_12: ## %else10
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpextrb $6, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $64, %al
; AVX1OR2-NEXT: je LBB15_14
-; AVX1OR2-NEXT: ## %bb.13: ## %cond.store11
+; AVX1OR2-NEXT: LBB15_13: ## %cond.store11
; AVX1OR2-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX1OR2-NEXT: LBB15_14: ## %else12
-; AVX1OR2-NEXT: vpextrb $7, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testb $-128, %al
; AVX1OR2-NEXT: je LBB15_16
-; AVX1OR2-NEXT: ## %bb.15: ## %cond.store13
+; AVX1OR2-NEXT: LBB15_15: ## %cond.store13
; AVX1OR2-NEXT: vpextrb $7, %xmm1, 7(%rdi)
-; AVX1OR2-NEXT: LBB15_16: ## %else14
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpextrb $8, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
; AVX1OR2-NEXT: je LBB15_18
-; AVX1OR2-NEXT: ## %bb.17: ## %cond.store15
+; AVX1OR2-NEXT: LBB15_17: ## %cond.store15
; AVX1OR2-NEXT: vpextrb $8, %xmm1, 8(%rdi)
-; AVX1OR2-NEXT: LBB15_18: ## %else16
-; AVX1OR2-NEXT: vpextrb $9, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
; AVX1OR2-NEXT: je LBB15_20
-; AVX1OR2-NEXT: ## %bb.19: ## %cond.store17
+; AVX1OR2-NEXT: LBB15_19: ## %cond.store17
; AVX1OR2-NEXT: vpextrb $9, %xmm1, 9(%rdi)
-; AVX1OR2-NEXT: LBB15_20: ## %else18
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1OR2-NEXT: je LBB15_22
-; AVX1OR2-NEXT: ## %bb.21: ## %cond.store19
+; AVX1OR2-NEXT: LBB15_21: ## %cond.store19
; AVX1OR2-NEXT: vpextrb $10, %xmm1, 10(%rdi)
-; AVX1OR2-NEXT: LBB15_22: ## %else20
-; AVX1OR2-NEXT: vpextrb $11, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1OR2-NEXT: je LBB15_24
-; AVX1OR2-NEXT: ## %bb.23: ## %cond.store21
+; AVX1OR2-NEXT: LBB15_23: ## %cond.store21
; AVX1OR2-NEXT: vpextrb $11, %xmm1, 11(%rdi)
-; AVX1OR2-NEXT: LBB15_24: ## %else22
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpextrb $12, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1OR2-NEXT: je LBB15_26
-; AVX1OR2-NEXT: ## %bb.25: ## %cond.store23
+; AVX1OR2-NEXT: LBB15_25: ## %cond.store23
; AVX1OR2-NEXT: vpextrb $12, %xmm1, 12(%rdi)
-; AVX1OR2-NEXT: LBB15_26: ## %else24
-; AVX1OR2-NEXT: vpextrb $13, %xmm2, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1OR2-NEXT: je LBB15_28
-; AVX1OR2-NEXT: ## %bb.27: ## %cond.store25
+; AVX1OR2-NEXT: LBB15_27: ## %cond.store25
; AVX1OR2-NEXT: vpextrb $13, %xmm1, 13(%rdi)
-; AVX1OR2-NEXT: LBB15_28: ## %else26
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1OR2-NEXT: je LBB15_30
-; AVX1OR2-NEXT: ## %bb.29: ## %cond.store27
+; AVX1OR2-NEXT: LBB15_29: ## %cond.store27
; AVX1OR2-NEXT: vpextrb $14, %xmm1, 14(%rdi)
-; AVX1OR2-NEXT: LBB15_30: ## %else28
-; AVX1OR2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1OR2-NEXT: je LBB15_32
-; AVX1OR2-NEXT: ## %bb.31: ## %cond.store29
+; AVX1OR2-NEXT: LBB15_31: ## %cond.store29
; AVX1OR2-NEXT: vpextrb $15, %xmm1, 15(%rdi)
-; AVX1OR2-NEXT: LBB15_32: ## %else30
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: store_v16i8_v16i8:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovmskb %xmm0, %eax
; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je LBB15_2
-; AVX512F-NEXT: ## %bb.1: ## %cond.store
+; AVX512F-NEXT: jne LBB15_1
+; AVX512F-NEXT: ## %bb.2: ## %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne LBB15_3
+; AVX512F-NEXT: LBB15_4: ## %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: jne LBB15_5
+; AVX512F-NEXT: LBB15_6: ## %else4
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne LBB15_7
+; AVX512F-NEXT: LBB15_8: ## %else6
+; AVX512F-NEXT: testb $16, %al
+; AVX512F-NEXT: jne LBB15_9
+; AVX512F-NEXT: LBB15_10: ## %else8
+; AVX512F-NEXT: testb $32, %al
+; AVX512F-NEXT: jne LBB15_11
+; AVX512F-NEXT: LBB15_12: ## %else10
+; AVX512F-NEXT: testb $64, %al
+; AVX512F-NEXT: jne LBB15_13
+; AVX512F-NEXT: LBB15_14: ## %else12
+; AVX512F-NEXT: testb $-128, %al
+; AVX512F-NEXT: jne LBB15_15
+; AVX512F-NEXT: LBB15_16: ## %else14
+; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512F-NEXT: jne LBB15_17
+; AVX512F-NEXT: LBB15_18: ## %else16
+; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512F-NEXT: jne LBB15_19
+; AVX512F-NEXT: LBB15_20: ## %else18
+; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512F-NEXT: jne LBB15_21
+; AVX512F-NEXT: LBB15_22: ## %else20
+; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512F-NEXT: jne LBB15_23
+; AVX512F-NEXT: LBB15_24: ## %else22
+; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512F-NEXT: jne LBB15_25
+; AVX512F-NEXT: LBB15_26: ## %else24
+; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512F-NEXT: jne LBB15_27
+; AVX512F-NEXT: LBB15_28: ## %else26
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512F-NEXT: jne LBB15_29
+; AVX512F-NEXT: LBB15_30: ## %else28
+; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512F-NEXT: jne LBB15_31
+; AVX512F-NEXT: LBB15_32: ## %else30
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: LBB15_1: ## %cond.store
; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi)
-; AVX512F-NEXT: LBB15_2: ## %else
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je LBB15_4
-; AVX512F-NEXT: ## %bb.3: ## %cond.store1
+; AVX512F-NEXT: LBB15_3: ## %cond.store1
; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi)
-; AVX512F-NEXT: LBB15_4: ## %else2
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je LBB15_6
-; AVX512F-NEXT: ## %bb.5: ## %cond.store3
+; AVX512F-NEXT: LBB15_5: ## %cond.store3
; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi)
-; AVX512F-NEXT: LBB15_6: ## %else4
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB15_8
-; AVX512F-NEXT: ## %bb.7: ## %cond.store5
+; AVX512F-NEXT: LBB15_7: ## %cond.store5
; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
-; AVX512F-NEXT: LBB15_8: ## %else6
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB15_10
-; AVX512F-NEXT: ## %bb.9: ## %cond.store7
+; AVX512F-NEXT: LBB15_9: ## %cond.store7
; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
-; AVX512F-NEXT: LBB15_10: ## %else8
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB15_12
-; AVX512F-NEXT: ## %bb.11: ## %cond.store9
+; AVX512F-NEXT: LBB15_11: ## %cond.store9
; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
-; AVX512F-NEXT: LBB15_12: ## %else10
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB15_14
-; AVX512F-NEXT: ## %bb.13: ## %cond.store11
+; AVX512F-NEXT: LBB15_13: ## %cond.store11
; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX512F-NEXT: LBB15_14: ## %else12
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je LBB15_16
-; AVX512F-NEXT: ## %bb.15: ## %cond.store13
+; AVX512F-NEXT: LBB15_15: ## %cond.store13
; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
-; AVX512F-NEXT: LBB15_16: ## %else14
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: je LBB15_18
-; AVX512F-NEXT: ## %bb.17: ## %cond.store15
+; AVX512F-NEXT: LBB15_17: ## %cond.store15
; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
-; AVX512F-NEXT: LBB15_18: ## %else16
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB15_20
-; AVX512F-NEXT: ## %bb.19: ## %cond.store17
+; AVX512F-NEXT: LBB15_19: ## %cond.store17
; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
-; AVX512F-NEXT: LBB15_20: ## %else18
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $10, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB15_22
-; AVX512F-NEXT: ## %bb.21: ## %cond.store19
+; AVX512F-NEXT: LBB15_21: ## %cond.store19
; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
-; AVX512F-NEXT: LBB15_22: ## %else20
-; AVX512F-NEXT: kshiftrw $11, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB15_24
-; AVX512F-NEXT: ## %bb.23: ## %cond.store21
+; AVX512F-NEXT: LBB15_23: ## %cond.store21
; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
-; AVX512F-NEXT: LBB15_24: ## %else22
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB15_26
-; AVX512F-NEXT: ## %bb.25: ## %cond.store23
+; AVX512F-NEXT: LBB15_25: ## %cond.store23
; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
-; AVX512F-NEXT: LBB15_26: ## %else24
-; AVX512F-NEXT: kshiftrw $13, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB15_28
-; AVX512F-NEXT: ## %bb.27: ## %cond.store25
+; AVX512F-NEXT: LBB15_27: ## %cond.store25
; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
-; AVX512F-NEXT: LBB15_28: ## %else26
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB15_30
-; AVX512F-NEXT: ## %bb.29: ## %cond.store27
+; AVX512F-NEXT: LBB15_29: ## %cond.store27
; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
-; AVX512F-NEXT: LBB15_30: ## %else28
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: je LBB15_32
-; AVX512F-NEXT: ## %bb.31: ## %cond.store29
+; AVX512F-NEXT: LBB15_31: ## %cond.store29
; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
-; AVX512F-NEXT: LBB15_32: ## %else30
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v16i8_v16i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
+; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vpmovmskb %xmm0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB15_2
-; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT: jne LBB15_1
+; AVX512VLDQ-NEXT: ## %bb.2: ## %else
+; AVX512VLDQ-NEXT: testb $2, %al
+; AVX512VLDQ-NEXT: jne LBB15_3
+; AVX512VLDQ-NEXT: LBB15_4: ## %else2
+; AVX512VLDQ-NEXT: testb $4, %al
+; AVX512VLDQ-NEXT: jne LBB15_5
+; AVX512VLDQ-NEXT: LBB15_6: ## %else4
+; AVX512VLDQ-NEXT: testb $8, %al
+; AVX512VLDQ-NEXT: jne LBB15_7
+; AVX512VLDQ-NEXT: LBB15_8: ## %else6
+; AVX512VLDQ-NEXT: testb $16, %al
+; AVX512VLDQ-NEXT: jne LBB15_9
+; AVX512VLDQ-NEXT: LBB15_10: ## %else8
+; AVX512VLDQ-NEXT: testb $32, %al
+; AVX512VLDQ-NEXT: jne LBB15_11
+; AVX512VLDQ-NEXT: LBB15_12: ## %else10
+; AVX512VLDQ-NEXT: testb $64, %al
+; AVX512VLDQ-NEXT: jne LBB15_13
+; AVX512VLDQ-NEXT: LBB15_14: ## %else12
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: jne LBB15_15
+; AVX512VLDQ-NEXT: LBB15_16: ## %else14
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512VLDQ-NEXT: jne LBB15_17
+; AVX512VLDQ-NEXT: LBB15_18: ## %else16
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512VLDQ-NEXT: jne LBB15_19
+; AVX512VLDQ-NEXT: LBB15_20: ## %else18
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512VLDQ-NEXT: jne LBB15_21
+; AVX512VLDQ-NEXT: LBB15_22: ## %else20
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512VLDQ-NEXT: jne LBB15_23
+; AVX512VLDQ-NEXT: LBB15_24: ## %else22
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512VLDQ-NEXT: jne LBB15_25
+; AVX512VLDQ-NEXT: LBB15_26: ## %else24
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512VLDQ-NEXT: jne LBB15_27
+; AVX512VLDQ-NEXT: LBB15_28: ## %else26
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512VLDQ-NEXT: jne LBB15_29
+; AVX512VLDQ-NEXT: LBB15_30: ## %else28
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: jne LBB15_31
+; AVX512VLDQ-NEXT: LBB15_32: ## %else30
+; AVX512VLDQ-NEXT: retq
+; AVX512VLDQ-NEXT: LBB15_1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi)
-; AVX512VLDQ-NEXT: LBB15_2: ## %else
-; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB15_4
-; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT: LBB15_3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi)
-; AVX512VLDQ-NEXT: LBB15_4: ## %else2
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB15_6
-; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3
+; AVX512VLDQ-NEXT: LBB15_5: ## %cond.store3
; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi)
-; AVX512VLDQ-NEXT: LBB15_6: ## %else4
-; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB15_8
-; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5
+; AVX512VLDQ-NEXT: LBB15_7: ## %cond.store5
; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi)
-; AVX512VLDQ-NEXT: LBB15_8: ## %else6
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB15_10
-; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7
+; AVX512VLDQ-NEXT: LBB15_9: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi)
-; AVX512VLDQ-NEXT: LBB15_10: ## %else8
-; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB15_12
-; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9
+; AVX512VLDQ-NEXT: LBB15_11: ## %cond.store9
; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi)
-; AVX512VLDQ-NEXT: LBB15_12: ## %else10
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB15_14
-; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11
+; AVX512VLDQ-NEXT: LBB15_13: ## %cond.store11
; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX512VLDQ-NEXT: LBB15_14: ## %else12
-; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: je LBB15_16
-; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13
+; AVX512VLDQ-NEXT: LBB15_15: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi)
-; AVX512VLDQ-NEXT: LBB15_16: ## %else14
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: je LBB15_18
-; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15
+; AVX512VLDQ-NEXT: LBB15_17: ## %cond.store15
; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi)
-; AVX512VLDQ-NEXT: LBB15_18: ## %else16
-; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB15_20
-; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17
+; AVX512VLDQ-NEXT: LBB15_19: ## %cond.store17
; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi)
-; AVX512VLDQ-NEXT: LBB15_20: ## %else18
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB15_22
-; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19
+; AVX512VLDQ-NEXT: LBB15_21: ## %cond.store19
; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi)
-; AVX512VLDQ-NEXT: LBB15_22: ## %else20
-; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB15_24
-; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21
+; AVX512VLDQ-NEXT: LBB15_23: ## %cond.store21
; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi)
-; AVX512VLDQ-NEXT: LBB15_24: ## %else22
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB15_26
-; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23
+; AVX512VLDQ-NEXT: LBB15_25: ## %cond.store23
; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi)
-; AVX512VLDQ-NEXT: LBB15_26: ## %else24
-; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB15_28
-; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25
+; AVX512VLDQ-NEXT: LBB15_27: ## %cond.store25
; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi)
-; AVX512VLDQ-NEXT: LBB15_28: ## %else26
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB15_30
-; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27
+; AVX512VLDQ-NEXT: LBB15_29: ## %cond.store27
; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi)
-; AVX512VLDQ-NEXT: LBB15_30: ## %else28
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB15_32
-; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29
+; AVX512VLDQ-NEXT: LBB15_31: ## %cond.store29
; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi)
-; AVX512VLDQ-NEXT: LBB15_32: ## %else30
-; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v16i8_v16i8:
@@ -3319,1592 +3117,1375 @@
; SSE2-LABEL: store_v32i8_v32i8:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: movd %xmm4, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: je LBB16_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movb %al, (%rdi)
-; SSE2-NEXT: LBB16_2: ## %else
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: je LBB16_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: orl %ecx, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: jne LBB16_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB16_3
; SSE2-NEXT: LBB16_4: ## %else2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: movd %xmm4, %ecx
-; SSE2-NEXT: movl %ecx, %edx
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: testb $1, %dl
-; SSE2-NEXT: je LBB16_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB16_5
; SSE2-NEXT: LBB16_6: ## %else4
-; SSE2-NEXT: shrl $24, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB16_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
-; SSE2-NEXT: shrl $24, %eax
-; SSE2-NEXT: movb %al, 3(%rdi)
+; SSE2-NEXT: LBB16_7: ## %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: LBB16_8: ## %else6
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $2, %xmm4, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $2, %xmm2, %eax
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: pextrw $2, %xmm2, %ecx
; SSE2-NEXT: je LBB16_10
; SSE2-NEXT: ## %bb.9: ## %cond.store7
-; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: LBB16_10: ## %else8
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB16_12
; SSE2-NEXT: ## %bb.11: ## %cond.store9
-; SSE2-NEXT: movb %ah, 5(%rdi)
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: LBB16_12: ## %else10
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $3, %xmm4, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $3, %xmm2, %eax
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm2, %ecx
; SSE2-NEXT: je LBB16_14
; SSE2-NEXT: ## %bb.13: ## %cond.store11
-; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: LBB16_14: ## %else12
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB16_16
; SSE2-NEXT: ## %bb.15: ## %cond.store13
-; SSE2-NEXT: movb %ah, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: LBB16_16: ## %else14
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $4, %xmm4, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
+; SSE2-NEXT: testl $256, %eax ## imm = 0x100
+; SSE2-NEXT: pextrw $4, %xmm2, %ecx
; SSE2-NEXT: je LBB16_18
; SSE2-NEXT: ## %bb.17: ## %cond.store15
-; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb %cl, 8(%rdi)
; SSE2-NEXT: LBB16_18: ## %else16
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $512, %eax ## imm = 0x200
; SSE2-NEXT: je LBB16_20
; SSE2-NEXT: ## %bb.19: ## %cond.store17
-; SSE2-NEXT: movb %ah, 9(%rdi)
+; SSE2-NEXT: movb %ch, 9(%rdi)
; SSE2-NEXT: LBB16_20: ## %else18
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $5, %xmm4, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $5, %xmm2, %eax
+; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
+; SSE2-NEXT: pextrw $5, %xmm2, %ecx
; SSE2-NEXT: je LBB16_22
; SSE2-NEXT: ## %bb.21: ## %cond.store19
-; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb %cl, 10(%rdi)
; SSE2-NEXT: LBB16_22: ## %else20
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
; SSE2-NEXT: je LBB16_24
; SSE2-NEXT: ## %bb.23: ## %cond.store21
-; SSE2-NEXT: movb %ah, 11(%rdi)
+; SSE2-NEXT: movb %ch, 11(%rdi)
; SSE2-NEXT: LBB16_24: ## %else22
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: pextrw $6, %xmm4, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
+; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
+; SSE2-NEXT: pextrw $6, %xmm2, %ecx
; SSE2-NEXT: je LBB16_26
; SSE2-NEXT: ## %bb.25: ## %cond.store23
-; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb %cl, 12(%rdi)
; SSE2-NEXT: LBB16_26: ## %else24
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
; SSE2-NEXT: je LBB16_28
; SSE2-NEXT: ## %bb.27: ## %cond.store25
-; SSE2-NEXT: movb %ah, 13(%rdi)
+; SSE2-NEXT: movb %ch, 13(%rdi)
; SSE2-NEXT: LBB16_28: ## %else26
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
-; SSE2-NEXT: pextrw $7, %xmm0, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $7, %xmm2, %eax
+; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
+; SSE2-NEXT: pextrw $7, %xmm2, %ecx
; SSE2-NEXT: je LBB16_30
; SSE2-NEXT: ## %bb.29: ## %cond.store27
-; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb %cl, 14(%rdi)
; SSE2-NEXT: LBB16_30: ## %else28
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
; SSE2-NEXT: je LBB16_32
; SSE2-NEXT: ## %bb.31: ## %cond.store29
-; SSE2-NEXT: movb %ah, 15(%rdi)
+; SSE2-NEXT: movb %ch, 15(%rdi)
; SSE2-NEXT: LBB16_32: ## %else30
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: je LBB16_34
-; SSE2-NEXT: ## %bb.33: ## %cond.store31
-; SSE2-NEXT: movb %al, 16(%rdi)
-; SSE2-NEXT: LBB16_34: ## %else32
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: je LBB16_36
-; SSE2-NEXT: ## %bb.35: ## %cond.store33
-; SSE2-NEXT: movb %ah, 17(%rdi)
+; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
+; SSE2-NEXT: movd %xmm3, %ecx
+; SSE2-NEXT: jne LBB16_33
+; SSE2-NEXT: ## %bb.34: ## %else32
+; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
+; SSE2-NEXT: jne LBB16_35
; SSE2-NEXT: LBB16_36: ## %else34
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: movl %ecx, %edx
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: testb $1, %dl
-; SSE2-NEXT: je LBB16_38
-; SSE2-NEXT: ## %bb.37: ## %cond.store35
-; SSE2-NEXT: movl %eax, %edx
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: movb %dl, 18(%rdi)
+; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
+; SSE2-NEXT: jne LBB16_37
; SSE2-NEXT: LBB16_38: ## %else36
-; SSE2-NEXT: shrl $24, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
; SSE2-NEXT: je LBB16_40
-; SSE2-NEXT: ## %bb.39: ## %cond.store37
-; SSE2-NEXT: shrl $24, %eax
-; SSE2-NEXT: movb %al, 19(%rdi)
+; SSE2-NEXT: LBB16_39: ## %cond.store37
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 19(%rdi)
; SSE2-NEXT: LBB16_40: ## %else38
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $2, %xmm3, %eax
+; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
+; SSE2-NEXT: pextrw $2, %xmm3, %ecx
; SSE2-NEXT: je LBB16_42
; SSE2-NEXT: ## %bb.41: ## %cond.store39
-; SSE2-NEXT: movb %al, 20(%rdi)
+; SSE2-NEXT: movb %cl, 20(%rdi)
; SSE2-NEXT: LBB16_42: ## %else40
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
; SSE2-NEXT: je LBB16_44
; SSE2-NEXT: ## %bb.43: ## %cond.store41
-; SSE2-NEXT: movb %ah, 21(%rdi)
+; SSE2-NEXT: movb %ch, 21(%rdi)
; SSE2-NEXT: LBB16_44: ## %else42
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $3, %xmm3, %eax
+; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
+; SSE2-NEXT: pextrw $3, %xmm3, %ecx
; SSE2-NEXT: je LBB16_46
; SSE2-NEXT: ## %bb.45: ## %cond.store43
-; SSE2-NEXT: movb %al, 22(%rdi)
+; SSE2-NEXT: movb %cl, 22(%rdi)
; SSE2-NEXT: LBB16_46: ## %else44
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
; SSE2-NEXT: je LBB16_48
; SSE2-NEXT: ## %bb.47: ## %cond.store45
-; SSE2-NEXT: movb %ah, 23(%rdi)
+; SSE2-NEXT: movb %ch, 23(%rdi)
; SSE2-NEXT: LBB16_48: ## %else46
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: pextrw $4, %xmm3, %eax
+; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
+; SSE2-NEXT: pextrw $4, %xmm3, %ecx
; SSE2-NEXT: je LBB16_50
; SSE2-NEXT: ## %bb.49: ## %cond.store47
-; SSE2-NEXT: movb %al, 24(%rdi)
+; SSE2-NEXT: movb %cl, 24(%rdi)
; SSE2-NEXT: LBB16_50: ## %else48
-; SSE2-NEXT: shrl $8, %ecx
-; SSE2-NEXT: testb $1,
%cl +; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; SSE2-NEXT: je LBB16_52 ; SSE2-NEXT: ## %bb.51: ## %cond.store49 -; SSE2-NEXT: movb %ah, 25(%rdi) +; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: LBB16_52: ## %else50 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $5, %xmm0, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: pextrw $5, %xmm3, %eax +; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; SSE2-NEXT: pextrw $5, %xmm3, %ecx ; SSE2-NEXT: je LBB16_54 ; SSE2-NEXT: ## %bb.53: ## %cond.store51 -; SSE2-NEXT: movb %al, 26(%rdi) +; SSE2-NEXT: movb %cl, 26(%rdi) ; SSE2-NEXT: LBB16_54: ## %else52 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; SSE2-NEXT: je LBB16_56 ; SSE2-NEXT: ## %bb.55: ## %cond.store53 -; SSE2-NEXT: movb %ah, 27(%rdi) +; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: LBB16_56: ## %else54 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: pextrw $6, %xmm3, %eax +; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; SSE2-NEXT: pextrw $6, %xmm3, %ecx ; SSE2-NEXT: je LBB16_58 ; SSE2-NEXT: ## %bb.57: ## %cond.store55 -; SSE2-NEXT: movb %al, 28(%rdi) +; SSE2-NEXT: movb %cl, 28(%rdi) ; SSE2-NEXT: LBB16_58: ## %else56 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; SSE2-NEXT: je LBB16_60 ; SSE2-NEXT: ## %bb.59: ## %cond.store57 -; SSE2-NEXT: movb %ah, 29(%rdi) +; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: LBB16_60: ## %else58 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pextrw $7, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: pextrw $7, %xmm3, %eax -; SSE2-NEXT: je LBB16_62 -; SSE2-NEXT: ## %bb.61: ## %cond.store59 -; SSE2-NEXT: movb %al, 30(%rdi) -; SSE2-NEXT: LBB16_62: ## %else60 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je LBB16_64 -; SSE2-NEXT: ## %bb.63: ## %cond.store61 -; SSE2-NEXT: movb %ah, 31(%rdi) +; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; SSE2-NEXT: pextrw $7, %xmm3, %ecx +; SSE2-NEXT: jne LBB16_61 +; SSE2-NEXT: ## %bb.62: ## %else60 +; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; SSE2-NEXT: jne LBB16_63 ; SSE2-NEXT: LBB16_64: ## %else62 ; SSE2-NEXT: retq +; SSE2-NEXT: LBB16_1: ## %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je LBB16_4 +; SSE2-NEXT: LBB16_3: ## %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je LBB16_6 +; SSE2-NEXT: LBB16_5: ## %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne LBB16_7 +; SSE2-NEXT: jmp LBB16_8 +; SSE2-NEXT: LBB16_33: ## %cond.store31 +; SSE2-NEXT: movb %cl, 16(%rdi) +; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000 +; SSE2-NEXT: je LBB16_36 +; SSE2-NEXT: LBB16_35: ## %cond.store33 +; SSE2-NEXT: movb %ch, 17(%rdi) +; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000 +; SSE2-NEXT: je LBB16_38 +; SSE2-NEXT: LBB16_37: ## %cond.store35 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 18(%rdi) +; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000 +; SSE2-NEXT: jne LBB16_39 +; SSE2-NEXT: jmp LBB16_40 +; SSE2-NEXT: LBB16_61: ## %cond.store59 +; SSE2-NEXT: movb %cl, 30(%rdi) +; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; SSE2-NEXT: je LBB16_64 +; 
SSE2-NEXT: LBB16_63: ## %cond.store61 +; SSE2-NEXT: movb %ch, 31(%rdi) +; SSE2-NEXT: retq ; ; SSE4-LABEL: store_v32i8_v32i8: ; SSE4: ## %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE4-NEXT: pextrb $0, %xmm4, %eax +; SSE4-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE4-NEXT: pmovmskb %xmm0, %ecx +; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 +; SSE4-NEXT: pmovmskb %xmm1, %eax +; SSE4-NEXT: shll $16, %eax +; SSE4-NEXT: orl %ecx, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je LBB16_2 -; SSE4-NEXT: ## %bb.1: ## %cond.store +; SSE4-NEXT: jne LBB16_1 +; SSE4-NEXT: ## %bb.2: ## %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne LBB16_3 +; SSE4-NEXT: LBB16_4: ## %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne LBB16_5 +; SSE4-NEXT: LBB16_6: ## %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne LBB16_7 +; SSE4-NEXT: LBB16_8: ## %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne LBB16_9 +; SSE4-NEXT: LBB16_10: ## %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne LBB16_11 +; SSE4-NEXT: LBB16_12: ## %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne LBB16_13 +; SSE4-NEXT: LBB16_14: ## %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne LBB16_15 +; SSE4-NEXT: LBB16_16: ## %else14 +; SSE4-NEXT: testl $256, %eax ## imm = 0x100 +; SSE4-NEXT: jne LBB16_17 +; SSE4-NEXT: LBB16_18: ## %else16 +; SSE4-NEXT: testl $512, %eax ## imm = 0x200 +; SSE4-NEXT: jne LBB16_19 +; SSE4-NEXT: LBB16_20: ## %else18 +; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 +; SSE4-NEXT: jne LBB16_21 +; SSE4-NEXT: LBB16_22: ## %else20 +; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 +; SSE4-NEXT: jne LBB16_23 +; SSE4-NEXT: LBB16_24: ## %else22 +; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 +; SSE4-NEXT: jne LBB16_25 +; SSE4-NEXT: LBB16_26: ## %else24 +; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 +; SSE4-NEXT: jne LBB16_27 +; SSE4-NEXT: LBB16_28: ## %else26 +; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 +; SSE4-NEXT: jne LBB16_29 +; SSE4-NEXT: LBB16_30: ## %else28 +; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 +; SSE4-NEXT: jne LBB16_31 +; SSE4-NEXT: LBB16_32: ## %else30 +; SSE4-NEXT: testl $65536, %eax ## imm = 0x10000 +; SSE4-NEXT: jne LBB16_33 +; SSE4-NEXT: LBB16_34: ## %else32 +; SSE4-NEXT: testl $131072, %eax ## imm = 0x20000 +; SSE4-NEXT: jne LBB16_35 +; SSE4-NEXT: LBB16_36: ## %else34 +; SSE4-NEXT: testl $262144, %eax ## imm = 0x40000 +; SSE4-NEXT: jne LBB16_37 +; SSE4-NEXT: LBB16_38: ## %else36 +; SSE4-NEXT: testl $524288, %eax ## imm = 0x80000 +; SSE4-NEXT: jne LBB16_39 +; SSE4-NEXT: LBB16_40: ## %else38 +; SSE4-NEXT: testl $1048576, %eax ## imm = 0x100000 +; SSE4-NEXT: jne LBB16_41 +; SSE4-NEXT: LBB16_42: ## %else40 +; SSE4-NEXT: testl $2097152, %eax ## imm = 0x200000 +; SSE4-NEXT: jne LBB16_43 +; SSE4-NEXT: LBB16_44: ## %else42 +; SSE4-NEXT: testl $4194304, %eax ## imm = 0x400000 +; SSE4-NEXT: jne LBB16_45 +; SSE4-NEXT: LBB16_46: ## %else44 +; SSE4-NEXT: testl $8388608, %eax ## imm = 0x800000 +; SSE4-NEXT: jne LBB16_47 +; SSE4-NEXT: LBB16_48: ## %else46 +; SSE4-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; SSE4-NEXT: jne LBB16_49 +; SSE4-NEXT: LBB16_50: ## %else48 +; SSE4-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; SSE4-NEXT: jne LBB16_51 +; SSE4-NEXT: LBB16_52: ## %else50 +; SSE4-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; SSE4-NEXT: jne LBB16_53 +; SSE4-NEXT: LBB16_54: ## %else52 +; SSE4-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; SSE4-NEXT: jne LBB16_55 +; SSE4-NEXT: LBB16_56: ## %else54 +; SSE4-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; SSE4-NEXT: jne 
LBB16_57 +; SSE4-NEXT: LBB16_58: ## %else56 +; SSE4-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; SSE4-NEXT: jne LBB16_59 +; SSE4-NEXT: LBB16_60: ## %else58 +; SSE4-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; SSE4-NEXT: jne LBB16_61 +; SSE4-NEXT: LBB16_62: ## %else60 +; SSE4-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; SSE4-NEXT: jne LBB16_63 +; SSE4-NEXT: LBB16_64: ## %else62 +; SSE4-NEXT: retq +; SSE4-NEXT: LBB16_1: ## %cond.store ; SSE4-NEXT: pextrb $0, %xmm2, (%rdi) -; SSE4-NEXT: LBB16_2: ## %else -; SSE4-NEXT: pextrb $1, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je LBB16_4 -; SSE4-NEXT: ## %bb.3: ## %cond.store1 +; SSE4-NEXT: LBB16_3: ## %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) -; SSE4-NEXT: LBB16_4: ## %else2 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE4-NEXT: pextrb $2, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je LBB16_6 -; SSE4-NEXT: ## %bb.5: ## %cond.store3 +; SSE4-NEXT: LBB16_5: ## %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm2, 2(%rdi) -; SSE4-NEXT: LBB16_6: ## %else4 -; SSE4-NEXT: pextrb $3, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je LBB16_8 -; SSE4-NEXT: ## %bb.7: ## %cond.store5 +; SSE4-NEXT: LBB16_7: ## %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm2, 3(%rdi) -; SSE4-NEXT: LBB16_8: ## %else6 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je LBB16_10 -; SSE4-NEXT: ## %bb.9: ## %cond.store7 +; SSE4-NEXT: LBB16_9: ## %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm2, 4(%rdi) -; SSE4-NEXT: LBB16_10: ## %else8 -; SSE4-NEXT: pextrb $5, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je LBB16_12 -; SSE4-NEXT: ## %bb.11: ## %cond.store9 +; SSE4-NEXT: LBB16_11: ## %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm2, 5(%rdi) -; SSE4-NEXT: LBB16_12: ## %else10 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE4-NEXT: pextrb $6, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je LBB16_14 -; SSE4-NEXT: ## %bb.13: ## %cond.store11 +; SSE4-NEXT: LBB16_13: ## %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm2, 6(%rdi) -; SSE4-NEXT: LBB16_14: ## %else12 -; SSE4-NEXT: pextrb $7, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je LBB16_16 -; SSE4-NEXT: ## %bb.15: ## %cond.store13 +; SSE4-NEXT: LBB16_15: ## %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm2, 7(%rdi) -; SSE4-NEXT: LBB16_16: ## %else14 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE4-NEXT: pextrb $8, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax ## imm = 0x100 ; SSE4-NEXT: je LBB16_18 -; SSE4-NEXT: ## %bb.17: ## %cond.store15 +; SSE4-NEXT: LBB16_17: ## %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm2, 8(%rdi) -; SSE4-NEXT: LBB16_18: ## %else16 -; SSE4-NEXT: pextrb $9, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax ## imm = 0x200 ; SSE4-NEXT: je LBB16_20 -; SSE4-NEXT: ## %bb.19: ## %cond.store17 +; SSE4-NEXT: LBB16_19: ## %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm2, 9(%rdi) -; SSE4-NEXT: LBB16_20: ## %else18 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE4-NEXT: pextrb $10, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE4-NEXT: je LBB16_22 -; SSE4-NEXT: ## %bb.21: ## %cond.store19 +; SSE4-NEXT: 
LBB16_21: ## %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm2, 10(%rdi) -; SSE4-NEXT: LBB16_22: ## %else20 -; SSE4-NEXT: pextrb $11, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 ; SSE4-NEXT: je LBB16_24 -; SSE4-NEXT: ## %bb.23: ## %cond.store21 +; SSE4-NEXT: LBB16_23: ## %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm2, 11(%rdi) -; SSE4-NEXT: LBB16_24: ## %else22 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE4-NEXT: pextrb $12, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE4-NEXT: je LBB16_26 -; SSE4-NEXT: ## %bb.25: ## %cond.store23 +; SSE4-NEXT: LBB16_25: ## %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm2, 12(%rdi) -; SSE4-NEXT: LBB16_26: ## %else24 -; SSE4-NEXT: pextrb $13, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 ; SSE4-NEXT: je LBB16_28 -; SSE4-NEXT: ## %bb.27: ## %cond.store25 +; SSE4-NEXT: LBB16_27: ## %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm2, 13(%rdi) -; SSE4-NEXT: LBB16_28: ## %else26 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm0 -; SSE4-NEXT: pextrb $14, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE4-NEXT: je LBB16_30 -; SSE4-NEXT: ## %bb.29: ## %cond.store27 +; SSE4-NEXT: LBB16_29: ## %cond.store27 ; SSE4-NEXT: pextrb $14, %xmm2, 14(%rdi) -; SSE4-NEXT: LBB16_30: ## %else28 -; SSE4-NEXT: pextrb $15, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 ; SSE4-NEXT: je LBB16_32 -; SSE4-NEXT: ## %bb.31: ## %cond.store29 +; SSE4-NEXT: LBB16_31: ## %cond.store29 ; SSE4-NEXT: pextrb $15, %xmm2, 15(%rdi) -; SSE4-NEXT: LBB16_32: ## %else30 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $65536, %eax ## imm = 0x10000 ; SSE4-NEXT: je LBB16_34 -; SSE4-NEXT: ## %bb.33: ## %cond.store31 +; SSE4-NEXT: LBB16_33: ## %cond.store31 ; SSE4-NEXT: pextrb $0, %xmm3, 16(%rdi) -; SSE4-NEXT: LBB16_34: ## %else32 -; SSE4-NEXT: pextrb $1, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $131072, %eax ## imm = 0x20000 ; SSE4-NEXT: je LBB16_36 -; SSE4-NEXT: ## %bb.35: ## %cond.store33 +; SSE4-NEXT: LBB16_35: ## %cond.store33 ; SSE4-NEXT: pextrb $1, %xmm3, 17(%rdi) -; SSE4-NEXT: LBB16_36: ## %else34 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $2, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $262144, %eax ## imm = 0x40000 ; SSE4-NEXT: je LBB16_38 -; SSE4-NEXT: ## %bb.37: ## %cond.store35 +; SSE4-NEXT: LBB16_37: ## %cond.store35 ; SSE4-NEXT: pextrb $2, %xmm3, 18(%rdi) -; SSE4-NEXT: LBB16_38: ## %else36 -; SSE4-NEXT: pextrb $3, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $524288, %eax ## imm = 0x80000 ; SSE4-NEXT: je LBB16_40 -; SSE4-NEXT: ## %bb.39: ## %cond.store37 +; SSE4-NEXT: LBB16_39: ## %cond.store37 ; SSE4-NEXT: pextrb $3, %xmm3, 19(%rdi) -; SSE4-NEXT: LBB16_40: ## %else38 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1048576, %eax ## imm = 0x100000 ; SSE4-NEXT: je LBB16_42 -; SSE4-NEXT: ## %bb.41: ## %cond.store39 +; SSE4-NEXT: LBB16_41: ## %cond.store39 ; SSE4-NEXT: pextrb $4, %xmm3, 20(%rdi) -; SSE4-NEXT: LBB16_42: ## %else40 -; SSE4-NEXT: pextrb $5, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2097152, %eax ## imm = 0x200000 ; 
SSE4-NEXT: je LBB16_44 -; SSE4-NEXT: ## %bb.43: ## %cond.store41 +; SSE4-NEXT: LBB16_43: ## %cond.store41 ; SSE4-NEXT: pextrb $5, %xmm3, 21(%rdi) -; SSE4-NEXT: LBB16_44: ## %else42 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $6, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4194304, %eax ## imm = 0x400000 ; SSE4-NEXT: je LBB16_46 -; SSE4-NEXT: ## %bb.45: ## %cond.store43 +; SSE4-NEXT: LBB16_45: ## %cond.store43 ; SSE4-NEXT: pextrb $6, %xmm3, 22(%rdi) -; SSE4-NEXT: LBB16_46: ## %else44 -; SSE4-NEXT: pextrb $7, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8388608, %eax ## imm = 0x800000 ; SSE4-NEXT: je LBB16_48 -; SSE4-NEXT: ## %bb.47: ## %cond.store45 +; SSE4-NEXT: LBB16_47: ## %cond.store45 ; SSE4-NEXT: pextrb $7, %xmm3, 23(%rdi) -; SSE4-NEXT: LBB16_48: ## %else46 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; SSE4-NEXT: je LBB16_50 -; SSE4-NEXT: ## %bb.49: ## %cond.store47 +; SSE4-NEXT: LBB16_49: ## %cond.store47 ; SSE4-NEXT: pextrb $8, %xmm3, 24(%rdi) -; SSE4-NEXT: LBB16_50: ## %else48 -; SSE4-NEXT: pextrb $9, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; SSE4-NEXT: je LBB16_52 -; SSE4-NEXT: ## %bb.51: ## %cond.store49 +; SSE4-NEXT: LBB16_51: ## %cond.store49 ; SSE4-NEXT: pextrb $9, %xmm3, 25(%rdi) -; SSE4-NEXT: LBB16_52: ## %else50 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $10, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; SSE4-NEXT: je LBB16_54 -; SSE4-NEXT: ## %bb.53: ## %cond.store51 +; SSE4-NEXT: LBB16_53: ## %cond.store51 ; SSE4-NEXT: pextrb $10, %xmm3, 26(%rdi) -; SSE4-NEXT: LBB16_54: ## %else52 -; SSE4-NEXT: pextrb $11, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; SSE4-NEXT: je LBB16_56 -; SSE4-NEXT: ## %bb.55: ## %cond.store53 +; SSE4-NEXT: LBB16_55: ## %cond.store53 ; SSE4-NEXT: pextrb $11, %xmm3, 27(%rdi) -; SSE4-NEXT: LBB16_56: ## %else54 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; SSE4-NEXT: je LBB16_58 -; SSE4-NEXT: ## %bb.57: ## %cond.store55 +; SSE4-NEXT: LBB16_57: ## %cond.store55 ; SSE4-NEXT: pextrb $12, %xmm3, 28(%rdi) -; SSE4-NEXT: LBB16_58: ## %else56 -; SSE4-NEXT: pextrb $13, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; SSE4-NEXT: je LBB16_60 -; SSE4-NEXT: ## %bb.59: ## %cond.store57 +; SSE4-NEXT: LBB16_59: ## %cond.store57 ; SSE4-NEXT: pextrb $13, %xmm3, 29(%rdi) -; SSE4-NEXT: LBB16_60: ## %else58 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE4-NEXT: pextrb $14, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; SSE4-NEXT: je LBB16_62 -; SSE4-NEXT: ## %bb.61: ## %cond.store59 +; SSE4-NEXT: LBB16_61: ## %cond.store59 ; SSE4-NEXT: pextrb $14, %xmm3, 30(%rdi) -; SSE4-NEXT: LBB16_62: ## %else60 -; SSE4-NEXT: pextrb $15, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; SSE4-NEXT: je LBB16_64 -; SSE4-NEXT: ## %bb.63: ## %cond.store61 +; SSE4-NEXT: LBB16_63: ## %cond.store61 ; SSE4-NEXT: pextrb $15, %xmm3, 31(%rdi) -; SSE4-NEXT: LBB16_64: 
## %else62 ; SSE4-NEXT: retq ; ; AVX1-LABEL: store_v32i8_v32i8: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB16_2 -; AVX1-NEXT: ## %bb.1: ## %cond.store +; AVX1-NEXT: vpmovmskb %xmm3, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: shll $16, %eax +; AVX1-NEXT: orl %ecx, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne LBB16_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB16_3 +; AVX1-NEXT: LBB16_4: ## %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne LBB16_5 +; AVX1-NEXT: LBB16_6: ## %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne LBB16_7 +; AVX1-NEXT: LBB16_8: ## %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne LBB16_9 +; AVX1-NEXT: LBB16_10: ## %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne LBB16_11 +; AVX1-NEXT: LBB16_12: ## %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne LBB16_13 +; AVX1-NEXT: LBB16_14: ## %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne LBB16_15 +; AVX1-NEXT: LBB16_16: ## %else14 +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 +; AVX1-NEXT: jne LBB16_17 +; AVX1-NEXT: LBB16_18: ## %else16 +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 +; AVX1-NEXT: jne LBB16_19 +; AVX1-NEXT: LBB16_20: ## %else18 +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX1-NEXT: jne LBB16_21 +; AVX1-NEXT: LBB16_22: ## %else20 +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX1-NEXT: jne LBB16_23 +; AVX1-NEXT: LBB16_24: ## %else22 +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX1-NEXT: jne LBB16_25 +; AVX1-NEXT: LBB16_26: ## %else24 +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX1-NEXT: jne LBB16_27 +; AVX1-NEXT: LBB16_28: ## %else26 +; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX1-NEXT: jne LBB16_29 +; AVX1-NEXT: LBB16_30: ## %else28 +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX1-NEXT: je LBB16_32 +; AVX1-NEXT: LBB16_31: ## %cond.store29 +; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX1-NEXT: LBB16_32: ## %else30 +; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: jne LBB16_33 +; AVX1-NEXT: ## %bb.34: ## %else32 +; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX1-NEXT: jne LBB16_35 +; AVX1-NEXT: LBB16_36: ## %else34 +; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX1-NEXT: jne LBB16_37 +; AVX1-NEXT: LBB16_38: ## %else36 +; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX1-NEXT: jne LBB16_39 +; AVX1-NEXT: LBB16_40: ## %else38 +; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX1-NEXT: jne LBB16_41 +; AVX1-NEXT: LBB16_42: ## %else40 +; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX1-NEXT: jne LBB16_43 +; AVX1-NEXT: LBB16_44: ## %else42 +; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX1-NEXT: jne LBB16_45 +; AVX1-NEXT: LBB16_46: ## %else44 +; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX1-NEXT: jne LBB16_47 +; AVX1-NEXT: LBB16_48: ## %else46 +; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX1-NEXT: jne LBB16_49 +; AVX1-NEXT: LBB16_50: ## %else48 +; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX1-NEXT: jne LBB16_51 +; AVX1-NEXT: LBB16_52: ## %else50 +; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX1-NEXT: jne LBB16_53 +; AVX1-NEXT: LBB16_54: ## %else52 +; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; 
AVX1-NEXT: jne LBB16_55 +; AVX1-NEXT: LBB16_56: ## %else54 +; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX1-NEXT: jne LBB16_57 +; AVX1-NEXT: LBB16_58: ## %else56 +; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX1-NEXT: jne LBB16_59 +; AVX1-NEXT: LBB16_60: ## %else58 +; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX1-NEXT: jne LBB16_61 +; AVX1-NEXT: LBB16_62: ## %else60 +; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; AVX1-NEXT: jne LBB16_63 +; AVX1-NEXT: LBB16_64: ## %else62 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: LBB16_1: ## %cond.store ; AVX1-NEXT: vpextrb $0, %xmm1, (%rdi) -; AVX1-NEXT: LBB16_2: ## %else -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je LBB16_4 -; AVX1-NEXT: ## %bb.3: ## %cond.store1 +; AVX1-NEXT: LBB16_3: ## %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm1, 1(%rdi) -; AVX1-NEXT: LBB16_4: ## %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $2, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB16_6 -; AVX1-NEXT: ## %bb.5: ## %cond.store3 +; AVX1-NEXT: LBB16_5: ## %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm1, 2(%rdi) -; AVX1-NEXT: LBB16_6: ## %else4 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB16_8 -; AVX1-NEXT: ## %bb.7: ## %cond.store5 -; AVX1-NEXT: vpextrb $3, %xmm1, 3(%rdi) -; AVX1-NEXT: LBB16_8: ## %else6 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: LBB16_7: ## %cond.store5 +; AVX1-NEXT: vpextrb $3, %xmm1, 3(%rdi) +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB16_10 -; AVX1-NEXT: ## %bb.9: ## %cond.store7 +; AVX1-NEXT: LBB16_9: ## %cond.store7 ; AVX1-NEXT: vpextrb $4, %xmm1, 4(%rdi) -; AVX1-NEXT: LBB16_10: ## %else8 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je LBB16_12 -; AVX1-NEXT: ## %bb.11: ## %cond.store9 +; AVX1-NEXT: LBB16_11: ## %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm1, 5(%rdi) -; AVX1-NEXT: LBB16_12: ## %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $6, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB16_14 -; AVX1-NEXT: ## %bb.13: ## %cond.store11 +; AVX1-NEXT: LBB16_13: ## %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; AVX1-NEXT: LBB16_14: ## %else12 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je LBB16_16 -; AVX1-NEXT: ## %bb.15: ## %cond.store13 +; AVX1-NEXT: LBB16_15: ## %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm1, 7(%rdi) -; AVX1-NEXT: LBB16_16: ## %else14 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 ; AVX1-NEXT: je LBB16_18 -; AVX1-NEXT: ## %bb.17: ## %cond.store15 +; AVX1-NEXT: LBB16_17: ## %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm1, 8(%rdi) -; AVX1-NEXT: LBB16_18: ## %else16 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: testb $1, 
%al +; AVX1-NEXT: testl $512, %eax ## imm = 0x200 ; AVX1-NEXT: je LBB16_20 -; AVX1-NEXT: ## %bb.19: ## %cond.store17 +; AVX1-NEXT: LBB16_19: ## %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm1, 9(%rdi) -; AVX1-NEXT: LBB16_20: ## %else18 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $10, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX1-NEXT: je LBB16_22 -; AVX1-NEXT: ## %bb.21: ## %cond.store19 +; AVX1-NEXT: LBB16_21: ## %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm1, 10(%rdi) -; AVX1-NEXT: LBB16_22: ## %else20 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX1-NEXT: je LBB16_24 -; AVX1-NEXT: ## %bb.23: ## %cond.store21 +; AVX1-NEXT: LBB16_23: ## %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm1, 11(%rdi) -; AVX1-NEXT: LBB16_24: ## %else22 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: je LBB16_26 -; AVX1-NEXT: ## %bb.25: ## %cond.store23 +; AVX1-NEXT: LBB16_25: ## %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm1, 12(%rdi) -; AVX1-NEXT: LBB16_26: ## %else24 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX1-NEXT: je LBB16_28 -; AVX1-NEXT: ## %bb.27: ## %cond.store25 +; AVX1-NEXT: LBB16_27: ## %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm1, 13(%rdi) -; AVX1-NEXT: LBB16_28: ## %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $14, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1-NEXT: je LBB16_30 -; AVX1-NEXT: ## %bb.29: ## %cond.store27 +; AVX1-NEXT: LBB16_29: ## %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm1, 14(%rdi) -; AVX1-NEXT: LBB16_30: ## %else28 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je LBB16_32 -; AVX1-NEXT: ## %bb.31: ## %cond.store29 -; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi) -; AVX1-NEXT: LBB16_32: ## %else30 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: je LBB16_34 -; AVX1-NEXT: ## %bb.33: ## %cond.store31 +; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX1-NEXT: jne LBB16_31 +; AVX1-NEXT: jmp LBB16_32 +; AVX1-NEXT: LBB16_33: ## %cond.store31 ; AVX1-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX1-NEXT: LBB16_34: ## %else32 -; AVX1-NEXT: vpextrb $1, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 ; AVX1-NEXT: je LBB16_36 -; AVX1-NEXT: ## %bb.35: ## %cond.store33 +; AVX1-NEXT: LBB16_35: ## %cond.store33 ; AVX1-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX1-NEXT: LBB16_36: ## %else34 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $2, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 ; AVX1-NEXT: je LBB16_38 -; AVX1-NEXT: ## %bb.37: ## %cond.store35 +; AVX1-NEXT: LBB16_37: ## %cond.store35 ; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX1-NEXT: LBB16_38: ## %else36 -; AVX1-NEXT: vpextrb 
$3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 ; AVX1-NEXT: je LBB16_40 -; AVX1-NEXT: ## %bb.39: ## %cond.store37 +; AVX1-NEXT: LBB16_39: ## %cond.store37 ; AVX1-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX1-NEXT: LBB16_40: ## %else38 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX1-NEXT: je LBB16_42 -; AVX1-NEXT: ## %bb.41: ## %cond.store39 +; AVX1-NEXT: LBB16_41: ## %cond.store39 ; AVX1-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX1-NEXT: LBB16_42: ## %else40 -; AVX1-NEXT: vpextrb $5, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX1-NEXT: je LBB16_44 -; AVX1-NEXT: ## %bb.43: ## %cond.store41 +; AVX1-NEXT: LBB16_43: ## %cond.store41 ; AVX1-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX1-NEXT: LBB16_44: ## %else42 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $6, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX1-NEXT: je LBB16_46 -; AVX1-NEXT: ## %bb.45: ## %cond.store43 +; AVX1-NEXT: LBB16_45: ## %cond.store43 ; AVX1-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX1-NEXT: LBB16_46: ## %else44 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX1-NEXT: je LBB16_48 -; AVX1-NEXT: ## %bb.47: ## %cond.store45 +; AVX1-NEXT: LBB16_47: ## %cond.store45 ; AVX1-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX1-NEXT: LBB16_48: ## %else46 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX1-NEXT: je LBB16_50 -; AVX1-NEXT: ## %bb.49: ## %cond.store47 +; AVX1-NEXT: LBB16_49: ## %cond.store47 ; AVX1-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX1-NEXT: LBB16_50: ## %else48 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; AVX1-NEXT: je LBB16_52 -; AVX1-NEXT: ## %bb.51: ## %cond.store49 +; AVX1-NEXT: LBB16_51: ## %cond.store49 ; AVX1-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX1-NEXT: LBB16_52: ## %else50 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $10, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; AVX1-NEXT: je LBB16_54 -; AVX1-NEXT: ## %bb.53: ## %cond.store51 +; AVX1-NEXT: LBB16_53: ## %cond.store51 ; AVX1-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX1-NEXT: LBB16_54: ## %else52 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; AVX1-NEXT: je LBB16_56 -; AVX1-NEXT: ## %bb.55: ## %cond.store53 +; AVX1-NEXT: LBB16_55: ## %cond.store53 ; AVX1-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX1-NEXT: LBB16_56: ## %else54 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX1-NEXT: je LBB16_58 -; AVX1-NEXT: ## %bb.57: ## %cond.store55 +; AVX1-NEXT: LBB16_57: ## %cond.store55 ; AVX1-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX1-NEXT: LBB16_58: ## %else56 -; AVX1-NEXT: vpextrb $13, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $536870912, %eax ## imm = 
0x20000000 ; AVX1-NEXT: je LBB16_60 -; AVX1-NEXT: ## %bb.59: ## %cond.store57 +; AVX1-NEXT: LBB16_59: ## %cond.store57 ; AVX1-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX1-NEXT: LBB16_60: ## %else58 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX1-NEXT: je LBB16_62 -; AVX1-NEXT: ## %bb.61: ## %cond.store59 +; AVX1-NEXT: LBB16_61: ## %cond.store59 ; AVX1-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX1-NEXT: LBB16_62: ## %else60 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX1-NEXT: je LBB16_64 -; AVX1-NEXT: ## %bb.63: ## %cond.store61 +; AVX1-NEXT: LBB16_63: ## %cond.store61 ; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX1-NEXT: LBB16_64: ## %else62 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_v32i8_v32i8: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $0, %xmm3, %eax +; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB16_2 -; AVX2-NEXT: ## %bb.1: ## %cond.store +; AVX2-NEXT: jne LBB16_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB16_3 +; AVX2-NEXT: LBB16_4: ## %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne LBB16_5 +; AVX2-NEXT: LBB16_6: ## %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne LBB16_7 +; AVX2-NEXT: LBB16_8: ## %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne LBB16_9 +; AVX2-NEXT: LBB16_10: ## %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne LBB16_11 +; AVX2-NEXT: LBB16_12: ## %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne LBB16_13 +; AVX2-NEXT: LBB16_14: ## %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne LBB16_15 +; AVX2-NEXT: LBB16_16: ## %else14 +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 +; AVX2-NEXT: jne LBB16_17 +; AVX2-NEXT: LBB16_18: ## %else16 +; AVX2-NEXT: testl $512, %eax ## imm = 0x200 +; AVX2-NEXT: jne LBB16_19 +; AVX2-NEXT: LBB16_20: ## %else18 +; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX2-NEXT: jne LBB16_21 +; AVX2-NEXT: LBB16_22: ## %else20 +; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX2-NEXT: jne LBB16_23 +; AVX2-NEXT: LBB16_24: ## %else22 +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX2-NEXT: jne LBB16_25 +; AVX2-NEXT: LBB16_26: ## %else24 +; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX2-NEXT: jne LBB16_27 +; AVX2-NEXT: LBB16_28: ## %else26 +; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX2-NEXT: jne LBB16_29 +; AVX2-NEXT: LBB16_30: ## %else28 +; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX2-NEXT: je LBB16_32 +; AVX2-NEXT: LBB16_31: ## %cond.store29 +; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX2-NEXT: LBB16_32: ## %else30 +; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: jne LBB16_33 +; AVX2-NEXT: ## %bb.34: ## %else32 +; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX2-NEXT: jne LBB16_35 +; AVX2-NEXT: LBB16_36: ## %else34 +; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX2-NEXT: jne LBB16_37 +; AVX2-NEXT: LBB16_38: ## %else36 +; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX2-NEXT: jne LBB16_39 +; AVX2-NEXT: LBB16_40: ## %else38 +; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX2-NEXT: jne LBB16_41 +; AVX2-NEXT: LBB16_42: ## %else40 +; AVX2-NEXT: 
testl $2097152, %eax ## imm = 0x200000 +; AVX2-NEXT: jne LBB16_43 +; AVX2-NEXT: LBB16_44: ## %else42 +; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX2-NEXT: jne LBB16_45 +; AVX2-NEXT: LBB16_46: ## %else44 +; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX2-NEXT: jne LBB16_47 +; AVX2-NEXT: LBB16_48: ## %else46 +; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX2-NEXT: jne LBB16_49 +; AVX2-NEXT: LBB16_50: ## %else48 +; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX2-NEXT: jne LBB16_51 +; AVX2-NEXT: LBB16_52: ## %else50 +; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX2-NEXT: jne LBB16_53 +; AVX2-NEXT: LBB16_54: ## %else52 +; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX2-NEXT: jne LBB16_55 +; AVX2-NEXT: LBB16_56: ## %else54 +; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX2-NEXT: jne LBB16_57 +; AVX2-NEXT: LBB16_58: ## %else56 +; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX2-NEXT: jne LBB16_59 +; AVX2-NEXT: LBB16_60: ## %else58 +; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX2-NEXT: jne LBB16_61 +; AVX2-NEXT: LBB16_62: ## %else60 +; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; AVX2-NEXT: jne LBB16_63 +; AVX2-NEXT: LBB16_64: ## %else62 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: LBB16_1: ## %cond.store ; AVX2-NEXT: vpextrb $0, %xmm1, (%rdi) -; AVX2-NEXT: LBB16_2: ## %else -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je LBB16_4 -; AVX2-NEXT: ## %bb.3: ## %cond.store1 +; AVX2-NEXT: LBB16_3: ## %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm1, 1(%rdi) -; AVX2-NEXT: LBB16_4: ## %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $2, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB16_6 -; AVX2-NEXT: ## %bb.5: ## %cond.store3 +; AVX2-NEXT: LBB16_5: ## %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm1, 2(%rdi) -; AVX2-NEXT: LBB16_6: ## %else4 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB16_8 -; AVX2-NEXT: ## %bb.7: ## %cond.store5 +; AVX2-NEXT: LBB16_7: ## %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm1, 3(%rdi) -; AVX2-NEXT: LBB16_8: ## %else6 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $4, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB16_10 -; AVX2-NEXT: ## %bb.9: ## %cond.store7 +; AVX2-NEXT: LBB16_9: ## %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm1, 4(%rdi) -; AVX2-NEXT: LBB16_10: ## %else8 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je LBB16_12 -; AVX2-NEXT: ## %bb.11: ## %cond.store9 +; AVX2-NEXT: LBB16_11: ## %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm1, 5(%rdi) -; AVX2-NEXT: LBB16_12: ## %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $6, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB16_14 -; AVX2-NEXT: ## %bb.13: ## %cond.store11 +; AVX2-NEXT: LBB16_13: ## %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; AVX2-NEXT: LBB16_14: ## %else12 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: testb $1, 
%al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB16_16 -; AVX2-NEXT: ## %bb.15: ## %cond.store13 +; AVX2-NEXT: LBB16_15: ## %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm1, 7(%rdi) -; AVX2-NEXT: LBB16_16: ## %else14 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $8, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB16_18 -; AVX2-NEXT: ## %bb.17: ## %cond.store15 +; AVX2-NEXT: LBB16_17: ## %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm1, 8(%rdi) -; AVX2-NEXT: LBB16_18: ## %else16 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax ## imm = 0x200 ; AVX2-NEXT: je LBB16_20 -; AVX2-NEXT: ## %bb.19: ## %cond.store17 +; AVX2-NEXT: LBB16_19: ## %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm1, 9(%rdi) -; AVX2-NEXT: LBB16_20: ## %else18 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $10, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX2-NEXT: je LBB16_22 -; AVX2-NEXT: ## %bb.21: ## %cond.store19 +; AVX2-NEXT: LBB16_21: ## %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm1, 10(%rdi) -; AVX2-NEXT: LBB16_22: ## %else20 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX2-NEXT: je LBB16_24 -; AVX2-NEXT: ## %bb.23: ## %cond.store21 +; AVX2-NEXT: LBB16_23: ## %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm1, 11(%rdi) -; AVX2-NEXT: LBB16_24: ## %else22 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB16_26 -; AVX2-NEXT: ## %bb.25: ## %cond.store23 +; AVX2-NEXT: LBB16_25: ## %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm1, 12(%rdi) -; AVX2-NEXT: LBB16_26: ## %else24 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX2-NEXT: je LBB16_28 -; AVX2-NEXT: ## %bb.27: ## %cond.store25 +; AVX2-NEXT: LBB16_27: ## %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm1, 13(%rdi) -; AVX2-NEXT: LBB16_28: ## %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $14, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB16_30 -; AVX2-NEXT: ## %bb.29: ## %cond.store27 +; AVX2-NEXT: LBB16_29: ## %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm1, 14(%rdi) -; AVX2-NEXT: LBB16_30: ## %else28 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm2, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je LBB16_32 -; AVX2-NEXT: ## %bb.31: ## %cond.store29 -; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi) -; AVX2-NEXT: LBB16_32: ## %else30 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $0, %xmm3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: je LBB16_34 -; AVX2-NEXT: ## %bb.33: ## %cond.store31 +; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX2-NEXT: jne LBB16_31 +; AVX2-NEXT: jmp LBB16_32 +; AVX2-NEXT: LBB16_33: ## %cond.store31 ; AVX2-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX2-NEXT: LBB16_34: ## %else32 -; 
AVX2-NEXT: vpextrb $1, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 ; AVX2-NEXT: je LBB16_36 -; AVX2-NEXT: ## %bb.35: ## %cond.store33 +; AVX2-NEXT: LBB16_35: ## %cond.store33 ; AVX2-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX2-NEXT: LBB16_36: ## %else34 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 ; AVX2-NEXT: je LBB16_38 -; AVX2-NEXT: ## %bb.37: ## %cond.store35 +; AVX2-NEXT: LBB16_37: ## %cond.store35 ; AVX2-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX2-NEXT: LBB16_38: ## %else36 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 ; AVX2-NEXT: je LBB16_40 -; AVX2-NEXT: ## %bb.39: ## %cond.store37 +; AVX2-NEXT: LBB16_39: ## %cond.store37 ; AVX2-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX2-NEXT: LBB16_40: ## %else38 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX2-NEXT: je LBB16_42 -; AVX2-NEXT: ## %bb.41: ## %cond.store39 +; AVX2-NEXT: LBB16_41: ## %cond.store39 ; AVX2-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX2-NEXT: LBB16_42: ## %else40 -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX2-NEXT: je LBB16_44 -; AVX2-NEXT: ## %bb.43: ## %cond.store41 +; AVX2-NEXT: LBB16_43: ## %cond.store41 ; AVX2-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX2-NEXT: LBB16_44: ## %else42 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX2-NEXT: je LBB16_46 -; AVX2-NEXT: ## %bb.45: ## %cond.store43 +; AVX2-NEXT: LBB16_45: ## %cond.store43 ; AVX2-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX2-NEXT: LBB16_46: ## %else44 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX2-NEXT: je LBB16_48 -; AVX2-NEXT: ## %bb.47: ## %cond.store45 +; AVX2-NEXT: LBB16_47: ## %cond.store45 ; AVX2-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX2-NEXT: LBB16_48: ## %else46 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX2-NEXT: je LBB16_50 -; AVX2-NEXT: ## %bb.49: ## %cond.store47 +; AVX2-NEXT: LBB16_49: ## %cond.store47 ; AVX2-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX2-NEXT: LBB16_50: ## %else48 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; AVX2-NEXT: je LBB16_52 -; AVX2-NEXT: ## %bb.51: ## %cond.store49 +; AVX2-NEXT: LBB16_51: ## %cond.store49 ; AVX2-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX2-NEXT: LBB16_52: ## %else50 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; AVX2-NEXT: je LBB16_54 -; AVX2-NEXT: ## %bb.53: ## %cond.store51 +; AVX2-NEXT: LBB16_53: ## %cond.store51 ; AVX2-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX2-NEXT: LBB16_54: ## %else52 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $134217728, %eax ## imm 
= 0x8000000 ; AVX2-NEXT: je LBB16_56 -; AVX2-NEXT: ## %bb.55: ## %cond.store53 +; AVX2-NEXT: LBB16_55: ## %cond.store53 ; AVX2-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX2-NEXT: LBB16_56: ## %else54 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX2-NEXT: je LBB16_58 -; AVX2-NEXT: ## %bb.57: ## %cond.store55 +; AVX2-NEXT: LBB16_57: ## %cond.store55 ; AVX2-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX2-NEXT: LBB16_58: ## %else56 -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX2-NEXT: je LBB16_60 -; AVX2-NEXT: ## %bb.59: ## %cond.store57 +; AVX2-NEXT: LBB16_59: ## %cond.store57 ; AVX2-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX2-NEXT: LBB16_60: ## %else58 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX2-NEXT: je LBB16_62 -; AVX2-NEXT: ## %bb.61: ## %cond.store59 +; AVX2-NEXT: LBB16_61: ## %cond.store59 ; AVX2-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX2-NEXT: LBB16_62: ## %else60 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX2-NEXT: je LBB16_64 -; AVX2-NEXT: ## %bb.63: ## %cond.store61 +; AVX2-NEXT: LBB16_63: ## %cond.store61 ; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX2-NEXT: LBB16_64: ## %else62 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: store_v32i8_v32i8: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je LBB16_2 -; AVX512F-NEXT: ## %bb.1: ## %cond.store +; AVX512F-NEXT: jne LBB16_1 +; AVX512F-NEXT: ## %bb.2: ## %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne LBB16_3 +; AVX512F-NEXT: LBB16_4: ## %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne LBB16_5 +; AVX512F-NEXT: LBB16_6: ## %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne LBB16_7 +; AVX512F-NEXT: LBB16_8: ## %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne LBB16_9 +; AVX512F-NEXT: LBB16_10: ## %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne LBB16_11 +; AVX512F-NEXT: LBB16_12: ## %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne LBB16_13 +; AVX512F-NEXT: LBB16_14: ## %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne LBB16_15 +; AVX512F-NEXT: LBB16_16: ## %else14 +; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 +; AVX512F-NEXT: jne LBB16_17 +; AVX512F-NEXT: LBB16_18: ## %else16 +; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 +; AVX512F-NEXT: jne LBB16_19 +; AVX512F-NEXT: LBB16_20: ## %else18 +; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 +; AVX512F-NEXT: jne LBB16_21 +; AVX512F-NEXT: LBB16_22: ## %else20 +; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 +; AVX512F-NEXT: jne LBB16_23 +; AVX512F-NEXT: LBB16_24: ## %else22 +; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 +; AVX512F-NEXT: jne LBB16_25 +; AVX512F-NEXT: LBB16_26: ## %else24 +; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 +; AVX512F-NEXT: jne LBB16_27 +; AVX512F-NEXT: LBB16_28: ## %else26 
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 +; AVX512F-NEXT: jne LBB16_29 +; AVX512F-NEXT: LBB16_30: ## %else28 +; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 +; AVX512F-NEXT: je LBB16_32 +; AVX512F-NEXT: LBB16_31: ## %cond.store29 +; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX512F-NEXT: LBB16_32: ## %else30 +; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: jne LBB16_33 +; AVX512F-NEXT: ## %bb.34: ## %else32 +; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 +; AVX512F-NEXT: jne LBB16_35 +; AVX512F-NEXT: LBB16_36: ## %else34 +; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000 +; AVX512F-NEXT: jne LBB16_37 +; AVX512F-NEXT: LBB16_38: ## %else36 +; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000 +; AVX512F-NEXT: jne LBB16_39 +; AVX512F-NEXT: LBB16_40: ## %else38 +; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000 +; AVX512F-NEXT: jne LBB16_41 +; AVX512F-NEXT: LBB16_42: ## %else40 +; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000 +; AVX512F-NEXT: jne LBB16_43 +; AVX512F-NEXT: LBB16_44: ## %else42 +; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000 +; AVX512F-NEXT: jne LBB16_45 +; AVX512F-NEXT: LBB16_46: ## %else44 +; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000 +; AVX512F-NEXT: jne LBB16_47 +; AVX512F-NEXT: LBB16_48: ## %else46 +; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000 +; AVX512F-NEXT: jne LBB16_49 +; AVX512F-NEXT: LBB16_50: ## %else48 +; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000 +; AVX512F-NEXT: jne LBB16_51 +; AVX512F-NEXT: LBB16_52: ## %else50 +; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000 +; AVX512F-NEXT: jne LBB16_53 +; AVX512F-NEXT: LBB16_54: ## %else52 +; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000 +; AVX512F-NEXT: jne LBB16_55 +; AVX512F-NEXT: LBB16_56: ## %else54 +; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000 +; AVX512F-NEXT: jne LBB16_57 +; AVX512F-NEXT: LBB16_58: ## %else56 +; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000 +; AVX512F-NEXT: jne LBB16_59 +; AVX512F-NEXT: LBB16_60: ## %else58 +; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000 +; AVX512F-NEXT: jne LBB16_61 +; AVX512F-NEXT: LBB16_62: ## %else60 +; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 +; AVX512F-NEXT: jne LBB16_63 +; AVX512F-NEXT: LBB16_64: ## %else62 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: LBB16_1: ## %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi) -; AVX512F-NEXT: LBB16_2: ## %else -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je LBB16_4 -; AVX512F-NEXT: ## %bb.3: ## %cond.store1 +; AVX512F-NEXT: LBB16_3: ## %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi) -; AVX512F-NEXT: LBB16_4: ## %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je LBB16_6 -; AVX512F-NEXT: ## %bb.5: ## %cond.store3 +; AVX512F-NEXT: LBB16_5: ## %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi) -; AVX512F-NEXT: LBB16_6: ## %else4 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; 
AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je LBB16_8
-; AVX512F-NEXT: ## %bb.7: ## %cond.store5
+; AVX512F-NEXT: LBB16_7: ## %cond.store5
; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
-; AVX512F-NEXT: LBB16_8: ## %else6
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je LBB16_10
-; AVX512F-NEXT: ## %bb.9: ## %cond.store7
+; AVX512F-NEXT: LBB16_9: ## %cond.store7
; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
-; AVX512F-NEXT: LBB16_10: ## %else8
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je LBB16_12
-; AVX512F-NEXT: ## %bb.11: ## %cond.store9
+; AVX512F-NEXT: LBB16_11: ## %cond.store9
; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
-; AVX512F-NEXT: LBB16_12: ## %else10
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je LBB16_14
-; AVX512F-NEXT: ## %bb.13: ## %cond.store11
+; AVX512F-NEXT: LBB16_13: ## %cond.store11
; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX512F-NEXT: LBB16_14: ## %else12
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je LBB16_16
-; AVX512F-NEXT: ## %bb.15: ## %cond.store13
+; AVX512F-NEXT: LBB16_15: ## %cond.store13
; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
-; AVX512F-NEXT: LBB16_16: ## %else14
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: je LBB16_18
-; AVX512F-NEXT: ## %bb.17: ## %cond.store15
+; AVX512F-NEXT: LBB16_17: ## %cond.store15
; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
-; AVX512F-NEXT: LBB16_18: ## %else16
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB16_20
-; AVX512F-NEXT: ## %bb.19: ## %cond.store17
+; AVX512F-NEXT: LBB16_19: ## %cond.store17
; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
-; AVX512F-NEXT: LBB16_20: ## %else18
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $10, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB16_22
-; AVX512F-NEXT: ## %bb.21: ## %cond.store19
+; AVX512F-NEXT: LBB16_21: ## %cond.store19
; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
-; AVX512F-NEXT: LBB16_22: ## %else20
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $11, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB16_24
-; AVX512F-NEXT: ## %bb.23: ## %cond.store21
+; AVX512F-NEXT: LBB16_23: ## %cond.store21
; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
-; AVX512F-NEXT: LBB16_24: ## %else22
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB16_26
-; AVX512F-NEXT: ## %bb.25: ## %cond.store23
+; AVX512F-NEXT: LBB16_25: ## %cond.store23
; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
-; AVX512F-NEXT: LBB16_26: ## %else24
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $13, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB16_28
-; AVX512F-NEXT: ## %bb.27: ## %cond.store25
+; AVX512F-NEXT: LBB16_27: ## %cond.store25
; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
-; AVX512F-NEXT: LBB16_28: ## %else26
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB16_30
-; AVX512F-NEXT: ## %bb.29: ## %cond.store27
+; AVX512F-NEXT: LBB16_29: ## %cond.store27
; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
-; AVX512F-NEXT: LBB16_30: ## %else28
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je LBB16_32
-; AVX512F-NEXT: ## %bb.31: ## %cond.store29
-; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
-; AVX512F-NEXT: LBB16_32: ## %else30
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: je LBB16_34
-; AVX512F-NEXT: ## %bb.33: ## %cond.store31
+; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512F-NEXT: jne LBB16_31
+; AVX512F-NEXT: jmp LBB16_32
+; AVX512F-NEXT: LBB16_33: ## %cond.store31
; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi)
-; AVX512F-NEXT: LBB16_34: ## %else32
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX512F-NEXT: je LBB16_36
-; AVX512F-NEXT: ## %bb.35: ## %cond.store33
+; AVX512F-NEXT: LBB16_35: ## %cond.store33
; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi)
-; AVX512F-NEXT: LBB16_36: ## %else34
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX512F-NEXT: je LBB16_38
-; AVX512F-NEXT: ## %bb.37: ## %cond.store35
+; AVX512F-NEXT: LBB16_37: ## %cond.store35
; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi)
-; AVX512F-NEXT: LBB16_38: ## %else36
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX512F-NEXT: je LBB16_40
-; AVX512F-NEXT: ## %bb.39: ## %cond.store37
+; AVX512F-NEXT: LBB16_39: ## %cond.store37
; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi)
-; AVX512F-NEXT: LBB16_40: ## %else38
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX512F-NEXT: je LBB16_42
-; AVX512F-NEXT: ## %bb.41: ## %cond.store39
+; AVX512F-NEXT: LBB16_41: ## %cond.store39
; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi)
-; AVX512F-NEXT: LBB16_42: ## %else40
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX512F-NEXT: je LBB16_44
-; AVX512F-NEXT: ## %bb.43: ## %cond.store41
+; AVX512F-NEXT: LBB16_43: ## %cond.store41
; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi)
-; AVX512F-NEXT: LBB16_44: ## %else42
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX512F-NEXT: je LBB16_46
-; AVX512F-NEXT: ## %bb.45: ## %cond.store43
+; AVX512F-NEXT: LBB16_45: ## %cond.store43
; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi)
-; AVX512F-NEXT: LBB16_46: ## %else44
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX512F-NEXT: je LBB16_48
-; AVX512F-NEXT: ## %bb.47: ## %cond.store45
+; AVX512F-NEXT: LBB16_47: ## %cond.store45
; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi)
-; AVX512F-NEXT: LBB16_48: ## %else46
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX512F-NEXT: je LBB16_50
-; AVX512F-NEXT: ## %bb.49: ## %cond.store47
+; AVX512F-NEXT: LBB16_49: ## %cond.store47
; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi)
-; AVX512F-NEXT: LBB16_50: ## %else48
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX512F-NEXT: je LBB16_52
-; AVX512F-NEXT: ## %bb.51: ## %cond.store49
+; AVX512F-NEXT: LBB16_51: ## %cond.store49
; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi)
-; AVX512F-NEXT: LBB16_52: ## %else50
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $10, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX512F-NEXT: je LBB16_54
-; AVX512F-NEXT: ## %bb.53: ## %cond.store51
+; AVX512F-NEXT: LBB16_53: ## %cond.store51
; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi)
-; AVX512F-NEXT: LBB16_54: ## %else52
-; AVX512F-NEXT: kshiftrw $11, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX512F-NEXT: je LBB16_56
-; AVX512F-NEXT: ## %bb.55: ## %cond.store53
+; AVX512F-NEXT: LBB16_55: ## %cond.store53
; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi)
-; AVX512F-NEXT: LBB16_56: ## %else54
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX512F-NEXT: je LBB16_58
-; AVX512F-NEXT: ## %bb.57: ## %cond.store55
+; AVX512F-NEXT: LBB16_57: ## %cond.store55
; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi)
-; AVX512F-NEXT: LBB16_58: ## %else56
-; AVX512F-NEXT: kshiftrw $13, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX512F-NEXT: je LBB16_60
-; AVX512F-NEXT: ## %bb.59: ## %cond.store57
+; AVX512F-NEXT: LBB16_59: ## %cond.store57
; AVX512F-NEXT: vpextrb $13, %xmm0, 29(%rdi)
-; AVX512F-NEXT: LBB16_60: ## %else58
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX512F-NEXT: je LBB16_62
-; AVX512F-NEXT: ## %bb.61: ## %cond.store59
+; AVX512F-NEXT: LBB16_61: ## %cond.store59
; AVX512F-NEXT: vpextrb $14, %xmm0, 30(%rdi)
-; AVX512F-NEXT: LBB16_62: ## %else60
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX512F-NEXT: je LBB16_64
-; AVX512F-NEXT: ## %bb.63: ## %cond.store61
+; AVX512F-NEXT: LBB16_63: ## %cond.store61
; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi)
-; AVX512F-NEXT: LBB16_64: ## %else62
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v32i8_v32i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
+; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT: vpmovmskb %ymm0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB16_2
-; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT: jne LBB16_1
+; AVX512VLDQ-NEXT: ## %bb.2: ## %else
+; AVX512VLDQ-NEXT: testb $2, %al
+; AVX512VLDQ-NEXT: jne LBB16_3
+; AVX512VLDQ-NEXT: LBB16_4: ## %else2
+; AVX512VLDQ-NEXT: testb $4, %al
+; AVX512VLDQ-NEXT: jne LBB16_5
+; AVX512VLDQ-NEXT: LBB16_6: ## %else4
+; AVX512VLDQ-NEXT: testb $8, %al
+; AVX512VLDQ-NEXT: jne LBB16_7
+; AVX512VLDQ-NEXT: LBB16_8: ## %else6
+; AVX512VLDQ-NEXT: testb $16, %al
+; AVX512VLDQ-NEXT: jne LBB16_9
+; AVX512VLDQ-NEXT: LBB16_10: ## %else8
+; AVX512VLDQ-NEXT: testb $32, %al
+; AVX512VLDQ-NEXT: jne LBB16_11
+; AVX512VLDQ-NEXT: LBB16_12: ## %else10
+; AVX512VLDQ-NEXT: testb $64, %al
+; AVX512VLDQ-NEXT: jne LBB16_13
+; AVX512VLDQ-NEXT: LBB16_14: ## %else12
+; AVX512VLDQ-NEXT: testb $-128, %al
+; AVX512VLDQ-NEXT: jne LBB16_15
+; AVX512VLDQ-NEXT: LBB16_16: ## %else14
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
+; AVX512VLDQ-NEXT: jne LBB16_17
+; AVX512VLDQ-NEXT: LBB16_18: ## %else16
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
+; AVX512VLDQ-NEXT: jne LBB16_19
+; AVX512VLDQ-NEXT: LBB16_20: ## %else18
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX512VLDQ-NEXT: jne LBB16_21
+; AVX512VLDQ-NEXT: LBB16_22: ## %else20
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX512VLDQ-NEXT: jne LBB16_23
+; AVX512VLDQ-NEXT: LBB16_24: ## %else22
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX512VLDQ-NEXT: jne LBB16_25
+; AVX512VLDQ-NEXT: LBB16_26: ## %else24
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX512VLDQ-NEXT: jne LBB16_27
+; AVX512VLDQ-NEXT: LBB16_28: ## %else26
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX512VLDQ-NEXT: jne LBB16_29
+; AVX512VLDQ-NEXT: LBB16_30: ## %else28
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: je LBB16_32
+; AVX512VLDQ-NEXT: LBB16_31: ## %cond.store29
+; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX512VLDQ-NEXT: LBB16_32: ## %else30
+; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
+; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT: jne LBB16_33
+; AVX512VLDQ-NEXT: ## %bb.34: ## %else32
+; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
+; AVX512VLDQ-NEXT: jne LBB16_35
+; AVX512VLDQ-NEXT: LBB16_36: ## %else34
+; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
+; AVX512VLDQ-NEXT: jne LBB16_37
+; AVX512VLDQ-NEXT: LBB16_38: ## %else36
+; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
+; AVX512VLDQ-NEXT: jne LBB16_39
+; AVX512VLDQ-NEXT: LBB16_40: ## %else38
+; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
+; AVX512VLDQ-NEXT: jne LBB16_41
+; AVX512VLDQ-NEXT: LBB16_42: ## %else40
+; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
+; AVX512VLDQ-NEXT: jne LBB16_43
+; AVX512VLDQ-NEXT: LBB16_44: ## %else42
+; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
+; AVX512VLDQ-NEXT: jne LBB16_45
+; AVX512VLDQ-NEXT: LBB16_46: ## %else44
+; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
+; AVX512VLDQ-NEXT: jne LBB16_47
+; AVX512VLDQ-NEXT: LBB16_48: ## %else46
+; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
+; AVX512VLDQ-NEXT: jne LBB16_49
+; AVX512VLDQ-NEXT: LBB16_50: ## %else48
+; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
+; AVX512VLDQ-NEXT: jne LBB16_51
+; AVX512VLDQ-NEXT: LBB16_52: ## %else50
+; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
+; AVX512VLDQ-NEXT: jne LBB16_53
+; AVX512VLDQ-NEXT: LBB16_54: ## %else52
+; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
+; AVX512VLDQ-NEXT: jne LBB16_55
+; AVX512VLDQ-NEXT: LBB16_56: ## %else54
+; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
+; AVX512VLDQ-NEXT: jne LBB16_57
+; AVX512VLDQ-NEXT: LBB16_58: ## %else56
+; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
+; AVX512VLDQ-NEXT: jne LBB16_59
+; AVX512VLDQ-NEXT: LBB16_60: ## %else58
+; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
+; AVX512VLDQ-NEXT: jne LBB16_61
+; AVX512VLDQ-NEXT: LBB16_62: ## %else60
+; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
+; AVX512VLDQ-NEXT: jne LBB16_63
+; AVX512VLDQ-NEXT: LBB16_64: ## %else62
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+; AVX512VLDQ-NEXT: LBB16_1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi)
-; AVX512VLDQ-NEXT: LBB16_2: ## %else
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $2, %al
; AVX512VLDQ-NEXT: je LBB16_4
-; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT: LBB16_3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi)
-; AVX512VLDQ-NEXT: LBB16_4: ## %else2
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $4, %al
; AVX512VLDQ-NEXT: je LBB16_6
-; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3
+; AVX512VLDQ-NEXT: LBB16_5: ## %cond.store3
; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi)
-; AVX512VLDQ-NEXT: LBB16_6: ## %else4
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $8, %al
; AVX512VLDQ-NEXT: je LBB16_8
-; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5
+; AVX512VLDQ-NEXT: LBB16_7: ## %cond.store5
; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi)
-; AVX512VLDQ-NEXT: LBB16_8: ## %else6
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $16, %al
; AVX512VLDQ-NEXT: je LBB16_10
-; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7
+; AVX512VLDQ-NEXT: LBB16_9: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi)
-; AVX512VLDQ-NEXT: LBB16_10: ## %else8
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $32, %al
; AVX512VLDQ-NEXT: je LBB16_12
-; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9
+; AVX512VLDQ-NEXT: LBB16_11: ## %cond.store9
; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi)
-; AVX512VLDQ-NEXT: LBB16_12: ## %else10
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $64, %al
; AVX512VLDQ-NEXT: je LBB16_14
-; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11
+; AVX512VLDQ-NEXT: LBB16_13: ## %cond.store11
; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX512VLDQ-NEXT: LBB16_14: ## %else12
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testb $-128, %al
; AVX512VLDQ-NEXT: je LBB16_16
-; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13
+; AVX512VLDQ-NEXT: LBB16_15: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi)
-; AVX512VLDQ-NEXT: LBB16_16: ## %else14
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: je LBB16_18
-; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15
+; AVX512VLDQ-NEXT: LBB16_17: ## %cond.store15
; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi)
-; AVX512VLDQ-NEXT: LBB16_18: ## %else16
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB16_20
-; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17
+; AVX512VLDQ-NEXT: LBB16_19: ## %cond.store17
; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi)
-; AVX512VLDQ-NEXT: LBB16_20: ## %else18
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB16_22
-; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19
+; AVX512VLDQ-NEXT: LBB16_21: ## %cond.store19
; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi)
-; AVX512VLDQ-NEXT: LBB16_22: ## %else20
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB16_24
-; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21
+; AVX512VLDQ-NEXT: LBB16_23: ## %cond.store21
; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi)
-; AVX512VLDQ-NEXT: LBB16_24: ## %else22
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB16_26
-; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23
+; AVX512VLDQ-NEXT: LBB16_25: ## %cond.store23
; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi)
-; AVX512VLDQ-NEXT: LBB16_26: ## %else24
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB16_28
-; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25
+; AVX512VLDQ-NEXT: LBB16_27: ## %cond.store25
; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi)
-; AVX512VLDQ-NEXT: LBB16_28: ## %else26
-; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
-; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB16_30
-; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27
+; AVX512VLDQ-NEXT: LBB16_29: ## %cond.store27
; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi)
-; AVX512VLDQ-NEXT: LBB16_30: ## %else28
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: je LBB16_32
-; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29
-; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi)
-; AVX512VLDQ-NEXT: LBB16_32: ## %else30
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VLDQ-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: je LBB16_34
-; AVX512VLDQ-NEXT: ## %bb.33: ## %cond.store31
+; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX512VLDQ-NEXT: jne LBB16_31
+; AVX512VLDQ-NEXT: jmp LBB16_32
+; AVX512VLDQ-NEXT: LBB16_33: ## %cond.store31
; AVX512VLDQ-NEXT: vpextrb $0, %xmm0, 16(%rdi)
-; AVX512VLDQ-NEXT: LBB16_34: ## %else32
-; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX512VLDQ-NEXT: je LBB16_36
-; AVX512VLDQ-NEXT: ## %bb.35: ## %cond.store33
+; AVX512VLDQ-NEXT: LBB16_35: ## %cond.store33
; AVX512VLDQ-NEXT: vpextrb $1, %xmm0, 17(%rdi)
-; AVX512VLDQ-NEXT: LBB16_36: ## %else34
-; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX512VLDQ-NEXT: je LBB16_38
-; AVX512VLDQ-NEXT: ## %bb.37: ## %cond.store35
+; AVX512VLDQ-NEXT: LBB16_37: ## %cond.store35
; AVX512VLDQ-NEXT: vpextrb $2, %xmm0, 18(%rdi)
-; AVX512VLDQ-NEXT: LBB16_38: ## %else36
-; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX512VLDQ-NEXT: je LBB16_40
-; AVX512VLDQ-NEXT: ## %bb.39: ## %cond.store37
+; AVX512VLDQ-NEXT: LBB16_39: ## %cond.store37
; AVX512VLDQ-NEXT: vpextrb $3, %xmm0, 19(%rdi)
-; AVX512VLDQ-NEXT: LBB16_40: ## %else38
-; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX512VLDQ-NEXT: je LBB16_42
-; AVX512VLDQ-NEXT: ## %bb.41: ## %cond.store39
+; AVX512VLDQ-NEXT: LBB16_41: ## %cond.store39
; AVX512VLDQ-NEXT: vpextrb $4, %xmm0, 20(%rdi)
-; AVX512VLDQ-NEXT: LBB16_42: ## %else40
-; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX512VLDQ-NEXT: je LBB16_44
-; AVX512VLDQ-NEXT: ## %bb.43: ## %cond.store41
+; AVX512VLDQ-NEXT: LBB16_43: ## %cond.store41
; AVX512VLDQ-NEXT: vpextrb $5, %xmm0, 21(%rdi)
-; AVX512VLDQ-NEXT: LBB16_44: ## %else42
-; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX512VLDQ-NEXT: je LBB16_46
-; AVX512VLDQ-NEXT: ## %bb.45: ## %cond.store43
+; AVX512VLDQ-NEXT: LBB16_45: ## %cond.store43
; AVX512VLDQ-NEXT: vpextrb $6, %xmm0, 22(%rdi)
-; AVX512VLDQ-NEXT: LBB16_46: ## %else44
-; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX512VLDQ-NEXT: je LBB16_48
-; AVX512VLDQ-NEXT: ## %bb.47: ## %cond.store45
+; AVX512VLDQ-NEXT: LBB16_47: ## %cond.store45
; AVX512VLDQ-NEXT: vpextrb $7, %xmm0, 23(%rdi)
-; AVX512VLDQ-NEXT: LBB16_48: ## %else46
-; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX512VLDQ-NEXT: je LBB16_50
-; AVX512VLDQ-NEXT: ## %bb.49: ## %cond.store47
+; AVX512VLDQ-NEXT: LBB16_49: ## %cond.store47
; AVX512VLDQ-NEXT: vpextrb $8, %xmm0, 24(%rdi)
-; AVX512VLDQ-NEXT: LBB16_50: ## %else48
-; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX512VLDQ-NEXT: je LBB16_52
-; AVX512VLDQ-NEXT: ## %bb.51: ## %cond.store49
+; AVX512VLDQ-NEXT: LBB16_51: ## %cond.store49
; AVX512VLDQ-NEXT: vpextrb $9, %xmm0, 25(%rdi)
-; AVX512VLDQ-NEXT: LBB16_52: ## %else50
-; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX512VLDQ-NEXT: je LBB16_54
-; AVX512VLDQ-NEXT: ## %bb.53: ## %cond.store51
+; AVX512VLDQ-NEXT: LBB16_53: ## %cond.store51
; AVX512VLDQ-NEXT: vpextrb $10, %xmm0, 26(%rdi)
-; AVX512VLDQ-NEXT: LBB16_54: ## %else52
-; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX512VLDQ-NEXT: je LBB16_56
-; AVX512VLDQ-NEXT: ## %bb.55: ## %cond.store53
+; AVX512VLDQ-NEXT: LBB16_55: ## %cond.store53
; AVX512VLDQ-NEXT: vpextrb $11, %xmm0, 27(%rdi)
-; AVX512VLDQ-NEXT: LBB16_56: ## %else54
-; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX512VLDQ-NEXT: je LBB16_58
-; AVX512VLDQ-NEXT: ## %bb.57: ## %cond.store55
+; AVX512VLDQ-NEXT: LBB16_57: ## %cond.store55
; AVX512VLDQ-NEXT: vpextrb $12, %xmm0, 28(%rdi)
-; AVX512VLDQ-NEXT: LBB16_58: ## %else56
-; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX512VLDQ-NEXT: je LBB16_60
-; AVX512VLDQ-NEXT: ## %bb.59: ## %cond.store57
+; AVX512VLDQ-NEXT: LBB16_59: ## %cond.store57
; AVX512VLDQ-NEXT: vpextrb $13, %xmm0, 29(%rdi)
-; AVX512VLDQ-NEXT: LBB16_60: ## %else58
-; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
-; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %k1, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX512VLDQ-NEXT: je LBB16_62
-; AVX512VLDQ-NEXT: ## %bb.61: ## %cond.store59
+; AVX512VLDQ-NEXT: LBB16_61: ## %cond.store59
; AVX512VLDQ-NEXT: vpextrb $14, %xmm0, 30(%rdi)
-; AVX512VLDQ-NEXT: LBB16_62: ## %else60
-; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
-; AVX512VLDQ-NEXT: testb $1, %al
+; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX512VLDQ-NEXT: je LBB16_64
-; AVX512VLDQ-NEXT: ## %bb.63: ## %cond.store61
+; AVX512VLDQ-NEXT: LBB16_63: ## %cond.store61
; AVX512VLDQ-NEXT: vpextrb $15, %xmm0, 31(%rdi)
-; AVX512VLDQ-NEXT: LBB16_64: ## %else62
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
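; The store_v32i8_v32i8 checks above show the shape of the new lowering: the
; compare runs once, vpmovmskb moves all mask bits into a GPR, and each lane
; is then selected with a single immediate bit test instead of a fresh
; vpcmpeqb/kshiftrw/kmovw sequence per lane. A minimal sketch of the per-lane
; IR this corresponds to, assuming a hypothetical <32 x i1> %mask (lane 3
; shown; the value and label names are illustrative only):
;   %scalar_mask = bitcast <32 x i1> %mask to i32
;   %mask_bit3 = and i32 %scalar_mask, 8
;   %cond3 = icmp ne i32 %mask_bit3, 0
;   br i1 %cond3, label %cond.store5, label %else6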
@@ -5060,61 +4641,38 @@
; SimplifyDemandedBits eliminates an ashr here.
define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x double>* %p, <4 x i32> %masksrc) {
-; SSE2-LABEL: masked_store_bool_mask_demand_trunc_sext:
-; SSE2: ## %bb.0:
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB23_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlps %xmm0, (%rdi)
-; SSE2-NEXT: LBB23_2: ## %else
-; SSE2-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB23_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhps %xmm0, 8(%rdi)
-; SSE2-NEXT: LBB23_4: ## %else2
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB23_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movlps %xmm1, 16(%rdi)
-; SSE2-NEXT: LBB23_6: ## %else4
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB23_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
-; SSE2-NEXT: movhps %xmm1, 24(%rdi)
-; SSE2-NEXT: LBB23_8: ## %else6
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: masked_store_bool_mask_demand_trunc_sext:
-; SSE4: ## %bb.0:
-; SSE4-NEXT: pextrb $0, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB23_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
-; SSE4-NEXT: movlps %xmm0, (%rdi)
-; SSE4-NEXT: LBB23_2: ## %else
-; SSE4-NEXT: pextrb $4, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB23_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
-; SSE4-NEXT: movhps %xmm0, 8(%rdi)
-; SSE4-NEXT: LBB23_4: ## %else2
-; SSE4-NEXT: pextrb $8, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB23_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
-; SSE4-NEXT: movlps %xmm1, 16(%rdi)
-; SSE4-NEXT: LBB23_6: ## %else4
-; SSE4-NEXT: pextrb $12, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB23_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
-; SSE4-NEXT: movhps %xmm1, 24(%rdi)
-; SSE4-NEXT: LBB23_8: ## %else6
-; SSE4-NEXT: retq
+; SSE-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; SSE: ## %bb.0:
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: testb $1, %al
+; SSE-NEXT: jne LBB23_1
+; SSE-NEXT: ## %bb.2: ## %else
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: jne LBB23_3
+; SSE-NEXT: LBB23_4: ## %else2
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: jne LBB23_5
+; SSE-NEXT: LBB23_6: ## %else4
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: jne LBB23_7
+; SSE-NEXT: LBB23_8: ## %else6
+; SSE-NEXT: retq
+; SSE-NEXT: LBB23_1: ## %cond.store
+; SSE-NEXT: movlps %xmm0, (%rdi)
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: je LBB23_4
+; SSE-NEXT: LBB23_3: ## %cond.store1
+; SSE-NEXT: movhps %xmm0, 8(%rdi)
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: je LBB23_6
+; SSE-NEXT: LBB23_5: ## %cond.store3
+; SSE-NEXT: movlps %xmm1, 16(%rdi)
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: je LBB23_8
+; SSE-NEXT: LBB23_7: ## %cond.store5
+; SSE-NEXT: movhps %xmm1, 24(%rdi)
+; SSE-NEXT: retq
;
; AVX1-LABEL: masked_store_bool_mask_demand_trunc_sext:
; AVX1: ## %bb.0:
@@ -5172,85 +4730,71 @@
define void @one_mask_bit_set1_variable(<4 x float>* %addr, <4 x float> %val, <4 x i32> %mask) {
; SSE2-LABEL: one_mask_bit_set1_variable:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movmskps %xmm1, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je LBB24_2
-; SSE2-NEXT: ## %bb.1: ## %cond.store
+; SSE2-NEXT: jne LBB24_1
+; SSE2-NEXT: ## %bb.2: ## %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne LBB24_3
+; SSE2-NEXT: LBB24_4: ## %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne LBB24_5
+; SSE2-NEXT: LBB24_6: ## %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne LBB24_7
+; SSE2-NEXT: LBB24_8: ## %else6
+; SSE2-NEXT: retq
+; SSE2-NEXT: LBB24_1: ## %cond.store
; SSE2-NEXT: movss %xmm0, (%rdi)
-; SSE2-NEXT: LBB24_2: ## %else
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB24_4
-; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; SSE2-NEXT: movss %xmm3, 4(%rdi)
-; SSE2-NEXT: LBB24_4: ## %else2
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB24_3: ## %cond.store1
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; SSE2-NEXT: movss %xmm1, 4(%rdi)
+; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB24_6
-; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE2-NEXT: movss %xmm2, 8(%rdi)
-; SSE2-NEXT: LBB24_6: ## %else4
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: LBB24_5: ## %cond.store3
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movss %xmm1, 8(%rdi)
+; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB24_8
-; SSE2-NEXT: ## %bb.7: ## %cond.store5
+; SSE2-NEXT: LBB24_7: ## %cond.store5
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: movss %xmm0, 12(%rdi)
-; SSE2-NEXT: LBB24_8: ## %else6
; SSE2-NEXT: retq
;
; SSE4-LABEL: one_mask_bit_set1_variable:
; SSE4: ## %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE4-NEXT: pand %xmm2, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE4-NEXT: pextrb $0, %xmm2, %eax
+; SSE4-NEXT: movmskps %xmm1, %eax
; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je LBB24_2
-; SSE4-NEXT: ## %bb.1: ## %cond.store
+; SSE4-NEXT: jne LBB24_1
+; SSE4-NEXT: ## %bb.2: ## %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne LBB24_3
+; SSE4-NEXT: LBB24_4: ## %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne LBB24_5
+; SSE4-NEXT: LBB24_6: ## %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne LBB24_7
+; SSE4-NEXT: LBB24_8: ## %else6
+; SSE4-NEXT: retq
+; SSE4-NEXT: LBB24_1: ## %cond.store
; SSE4-NEXT: movss %xmm0, (%rdi)
-; SSE4-NEXT: LBB24_2: ## %else
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE4-NEXT: pxor %xmm3, %xmm2
-; SSE4-NEXT: pextrb $4, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je LBB24_4
-; SSE4-NEXT: ## %bb.3: ## %cond.store1
+; SSE4-NEXT: LBB24_3: ## %cond.store1
; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi)
-; SSE4-NEXT: LBB24_4: ## %else2
-; SSE4-NEXT: pextrb $8, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je LBB24_6
-; SSE4-NEXT: ## %bb.5: ## %cond.store3
+; SSE4-NEXT: LBB24_5: ## %cond.store3
; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
-; SSE4-NEXT: LBB24_6: ## %else4
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE4-NEXT: pxor %xmm1, %xmm2
-; SSE4-NEXT: pextrb $12, %xmm2, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je LBB24_8
-; SSE4-NEXT: ## %bb.7: ## %cond.store5
+; SSE4-NEXT: LBB24_7: ## %cond.store5
; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi)
-; SSE4-NEXT: LBB24_8: ## %else6
; SSE4-NEXT: retq
;
; AVX1OR2-LABEL: one_mask_bit_set1_variable:
@@ -5286,24 +4830,31 @@
define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; SSE2-LABEL: widen_masked_store:
; SSE2: ## %bb.0:
-; SSE2-NEXT: testb $1, %sil
+; SSE2-NEXT: andb $1, %sil
+; SSE2-NEXT: andb $1, %dl
+; SSE2-NEXT: addb %dl, %dl
+; SSE2-NEXT: orb %sil, %dl
+; SSE2-NEXT: andb $1, %cl
+; SSE2-NEXT: shlb $2, %cl
+; SSE2-NEXT: orb %dl, %cl
+; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: jne LBB25_1
; SSE2-NEXT: ## %bb.2: ## %else
-; SSE2-NEXT: testb $1, %dl
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne LBB25_3
; SSE2-NEXT: LBB25_4: ## %else2
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne LBB25_5
; SSE2-NEXT: LBB25_6: ## %else4
; SSE2-NEXT: retq
; SSE2-NEXT: LBB25_1: ## %cond.store
; SSE2-NEXT: movd %xmm0, (%rdi)
-; SSE2-NEXT: testb $1, %dl
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je LBB25_4
; SSE2-NEXT: LBB25_3: ## %cond.store1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm1, 4(%rdi)
-; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je LBB25_6
; SSE2-NEXT: LBB25_5: ## %cond.store3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -5312,23 +4863,30 @@
;
; SSE4-LABEL: widen_masked_store:
; SSE4: ## %bb.0:
-; SSE4-NEXT: testb $1, %sil
+; SSE4-NEXT: andb $1, %sil
+; SSE4-NEXT: andb $1, %dl
+; SSE4-NEXT: addb %dl, %dl
+; SSE4-NEXT: orb %sil, %dl
+; SSE4-NEXT: andb $1, %cl
+; SSE4-NEXT: shlb $2, %cl
+; SSE4-NEXT: orb %dl, %cl
+; SSE4-NEXT: testb $1, %cl
; SSE4-NEXT: jne LBB25_1
; SSE4-NEXT: ## %bb.2: ## %else
-; SSE4-NEXT: testb $1, %dl
+; SSE4-NEXT: testb $2, %cl
; SSE4-NEXT: jne LBB25_3
; SSE4-NEXT: LBB25_4: ## %else2
-; SSE4-NEXT: testb $1, %cl
+; SSE4-NEXT: testb $4, %cl
; SSE4-NEXT: jne LBB25_5
; SSE4-NEXT: LBB25_6: ## %else4
; SSE4-NEXT: retq
; SSE4-NEXT: LBB25_1: ## %cond.store
; SSE4-NEXT: movss %xmm0, (%rdi)
-; SSE4-NEXT: testb $1, %dl
+; SSE4-NEXT: testb $2, %cl
; SSE4-NEXT: je LBB25_4
; SSE4-NEXT: LBB25_3: ## %cond.store1
; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi)
-; SSE4-NEXT: testb $1, %cl
+; SSE4-NEXT: testb $4, %cl
; SSE4-NEXT: je LBB25_6
; SSE4-NEXT: LBB25_5: ## %cond.store3
; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
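; In widen_masked_store above, the three i1 mask bits arrive in %sil, %dl and
; %cl, so the scalar bitmask is first assembled with and/add/shl/or before the
; bit tests. A minimal sketch of the equivalent mask materialization, assuming
; a hypothetical <3 x i1> %mask (lane 1 shown; names are illustrative only):
;   %scalar_mask = bitcast <3 x i1> %mask to i3
;   %mask_bit1 = and i3 %scalar_mask, 2
;   %cond1 = icmp ne i3 %mask_bit1, 0
;   br i1 %cond1, label %cond.store1, label %else2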
Index: llvm/trunk/test/CodeGen/X86/masked_store_trunc.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_store_trunc.ll
+++ llvm/trunk/test/CodeGen/X86/masked_store_trunc.ll
@@ -11,152 +11,146 @@
define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: packssdw %xmm0, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE2-NEXT: movd %xmm7, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB0_2
-; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movss %xmm0, (%rdi)
-; SSE2-NEXT: .LBB0_2: # %else
-; SSE2-NEXT: psrlq $16, %xmm6
-; SSE2-NEXT: movd %xmm6, %eax
-; SSE2-NEXT: shrl $16, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB0_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm1, 4(%rdi)
-; SSE2-NEXT: .LBB0_4: # %else2
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB0_6
-; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm4, 8(%rdi)
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: packsswb %xmm0, %xmm4
+; SSE2-NEXT: pmovmskb %xmm4, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: jne .LBB0_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB0_3
+; SSE2-NEXT: .LBB0_4: # %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne .LBB0_5
; SSE2-NEXT: .LBB0_6: # %else4
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB0_8
-; SSE2-NEXT: # %bb.7: # %cond.store5
+; SSE2-NEXT: .LBB0_7: # %cond.store5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: movd %xmm0, 12(%rdi)
; SSE2-NEXT: .LBB0_8: # %else6
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB0_10
-; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: jne .LBB0_9
+; SSE2-NEXT: # %bb.10: # %else8
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: jne .LBB0_11
+; SSE2-NEXT: .LBB0_12: # %else10
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: jne .LBB0_13
+; SSE2-NEXT: .LBB0_14: # %else12
+; SSE2-NEXT: testb $-128, %al
+; SSE2-NEXT: jne .LBB0_15
+; SSE2-NEXT: .LBB0_16: # %else14
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB0_1: # %cond.store
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: je .LBB0_4
+; SSE2-NEXT: .LBB0_3: # %cond.store1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movd %xmm1, 4(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: je .LBB0_6
+; SSE2-NEXT: .LBB0_5: # %cond.store3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, 8(%rdi)
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne .LBB0_7
+; SSE2-NEXT: jmp .LBB0_8
+; SSE2-NEXT: .LBB0_9: # %cond.store7
; SSE2-NEXT: movss %xmm2, 16(%rdi)
-; SSE2-NEXT: .LBB0_10: # %else8
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je .LBB0_12
-; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: .LBB0_11: # %cond.store9
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE2-NEXT: movd %xmm0, 20(%rdi)
-; SSE2-NEXT: .LBB0_12: # %else10
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je .LBB0_14
-; SSE2-NEXT: # %bb.13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, 24(%rdi)
-; SSE2-NEXT: .LBB0_14: # %else12
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: .LBB0_13: # %cond.store11
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, 24(%rdi)
+; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB0_16
-; SSE2-NEXT: # %bb.15: # %cond.store13
+; SSE2-NEXT: .LBB0_15: # %cond.store13
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE2-NEXT: movd %xmm0, 28(%rdi)
-; SSE2-NEXT: .LBB0_16: # %else14
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i64_v8i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: pxor %xmm7, %xmm7
-; SSE4-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm6
+; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE4-NEXT: pextrb $0, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB0_2
-; SSE4-NEXT: # %bb.1: # %cond.store
-; SSE4-NEXT: movss %xmm0, (%rdi)
-; SSE4-NEXT: .LBB0_2: # %else
-; SSE4-NEXT: pextrb $4, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB0_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
-; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi)
-; SSE4-NEXT: .LBB0_4: # %else2
-; SSE4-NEXT: xorps %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE4-NEXT: pcmpeqd %xmm6, %xmm5
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE4-NEXT: pxor %xmm4, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB0_6
-; SSE4-NEXT: # %bb.5: # %cond.store3
-; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi)
+; SSE4-NEXT: pxor %xmm1, %xmm5
+; SSE4-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE4-NEXT: pxor %xmm1, %xmm4
+; SSE4-NEXT: packssdw %xmm5, %xmm4
+; SSE4-NEXT: packsswb %xmm0, %xmm4
+; SSE4-NEXT: pmovmskb %xmm4, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB0_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB0_3
+; SSE4-NEXT: .LBB0_4: # %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne .LBB0_5
; SSE4-NEXT: .LBB0_6: # %else4
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB0_8
-; SSE4-NEXT: # %bb.7: # %cond.store5
-; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi)
+; SSE4-NEXT: .LBB0_7: # %cond.store5
+; SSE4-NEXT: pextrd $3, %xmm0, 12(%rdi)
; SSE4-NEXT: .LBB0_8: # %else6
; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB0_10
-; SSE4-NEXT: # %bb.9: # %cond.store7
+; SSE4-NEXT: testb $16, %al
+; SSE4-NEXT: jne .LBB0_9
+; SSE4-NEXT: # %bb.10: # %else8
+; SSE4-NEXT: testb $32, %al
+; SSE4-NEXT: jne .LBB0_11
+; SSE4-NEXT: .LBB0_12: # %else10
+; SSE4-NEXT: testb $64, %al
+; SSE4-NEXT: jne .LBB0_13
+; SSE4-NEXT: .LBB0_14: # %else12
+; SSE4-NEXT: testb $-128, %al
+; SSE4-NEXT: jne .LBB0_15
+; SSE4-NEXT: .LBB0_16: # %else14
+; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB0_1: # %cond.store
+; SSE4-NEXT: movd %xmm0, (%rdi)
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: je .LBB0_4
+; SSE4-NEXT: .LBB0_3: # %cond.store1
+; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi)
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: je .LBB0_6
+; SSE4-NEXT: .LBB0_5: # %cond.store3
+; SSE4-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne .LBB0_7
+; SSE4-NEXT: jmp .LBB0_8
+; SSE4-NEXT: .LBB0_9: # %cond.store7
; SSE4-NEXT: movss %xmm2, 16(%rdi)
-; SSE4-NEXT: .LBB0_10: # %else8
-; SSE4-NEXT: pextrb $4, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB0_12
-; SSE4-NEXT: # %bb.11: # %cond.store9
+; SSE4-NEXT: .LBB0_11: # %cond.store9
; SSE4-NEXT: extractps $1, %xmm2, 20(%rdi)
-; SSE4-NEXT: .LBB0_12: # %else10
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm5, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB0_14
-; SSE4-NEXT: # %bb.13: # %cond.store11
+; SSE4-NEXT: .LBB0_13: # %cond.store11
; SSE4-NEXT: extractps $2, %xmm2, 24(%rdi)
-; SSE4-NEXT: .LBB0_14: # %else12
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB0_16
-; SSE4-NEXT: # %bb.15: # %cond.store13
+; SSE4-NEXT: .LBB0_15: # %cond.store13
; SSE4-NEXT: extractps $3, %xmm2, 28(%rdi)
-; SSE4-NEXT: .LBB0_16: # %else14
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i32:
@@ -224,331 +218,311 @@
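; For the 8-lane truncstores below, the <8 x i32> compare result is narrowed
; with packssdw/packsswb so a single pmovmskb yields all eight mask bits; the
; AVX variants instead use movmskps on the eq-to-zero compare and flip every
; bit at once with notl. A minimal sketch of the mask IR this assumes, with
; hypothetical value names:
;   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
;   %bool_mask = xor <8 x i1> %cmp, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
;   %scalar_mask = bitcast <8 x i1> %bool_mask to i8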
define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: packssdw %xmm0, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: movd %xmm7, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB1_2
-; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: .LBB1_2: # %else
-; SSE2-NEXT: psrlq $16, %xmm6
-; SSE2-NEXT: movd %xmm6, %eax
-; SSE2-NEXT: shrl $16, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB1_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pextrw $1, %xmm0, %eax
-; SSE2-NEXT: movw %ax, 2(%rdi)
-; SSE2-NEXT: .LBB1_4: # %else2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB1_6
-; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: movw %ax, 4(%rdi)
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: packsswb %xmm0, %xmm4
+; SSE2-NEXT: pmovmskb %xmm4, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: jne .LBB1_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB1_3
+; SSE2-NEXT: .LBB1_4: # %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne .LBB1_5
; SSE2-NEXT: .LBB1_6: # %else4
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB1_8
-; SSE2-NEXT: # %bb.7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %eax
-; SSE2-NEXT: movw %ax, 6(%rdi)
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne .LBB1_7
; SSE2-NEXT: .LBB1_8: # %else6
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pextrw $0, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB1_10
-; SSE2-NEXT: # %bb.9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: movw %ax, 8(%rdi)
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: jne .LBB1_9
; SSE2-NEXT: .LBB1_10: # %else8
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB1_12
-; SSE2-NEXT: # %bb.11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %eax
-; SSE2-NEXT: movw %ax, 10(%rdi)
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: jne .LBB1_11
; SSE2-NEXT: .LBB1_12: # %else10
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB1_14
-; SSE2-NEXT: # %bb.13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
-; SSE2-NEXT: movw %ax, 12(%rdi)
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: jne .LBB1_13
; SSE2-NEXT: .LBB1_14: # %else12
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $-128, %al
+; SSE2-NEXT: jne .LBB1_15
+; SSE2-NEXT: .LBB1_16: # %else14
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB1_1: # %cond.store
+; SSE2-NEXT: movd %xmm0, %ecx
+; SSE2-NEXT: movw %cx, (%rdi)
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: je .LBB1_4
+; SSE2-NEXT: .LBB1_3: # %cond.store1
+; SSE2-NEXT: pextrw $1, %xmm0, %ecx
+; SSE2-NEXT: movw %cx, 2(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: je .LBB1_6
+; SSE2-NEXT: .LBB1_5: # %cond.store3
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: movw %cx, 4(%rdi)
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: je .LBB1_8
+; SSE2-NEXT: .LBB1_7: # %cond.store5
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
+; SSE2-NEXT: movw %cx, 6(%rdi)
+; SSE2-NEXT: testb $16, %al
+; SSE2-NEXT: je .LBB1_10
+; SSE2-NEXT: .LBB1_9: # %cond.store7
+; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: movw %cx, 8(%rdi)
+; SSE2-NEXT: testb $32, %al
+; SSE2-NEXT: je .LBB1_12
+; SSE2-NEXT: .LBB1_11: # %cond.store9
+; SSE2-NEXT: pextrw $5, %xmm0, %ecx
+; SSE2-NEXT: movw %cx, 10(%rdi)
+; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: je .LBB1_14
+; SSE2-NEXT: .LBB1_13: # %cond.store11
+; SSE2-NEXT: pextrw $6, %xmm0, %ecx
+; SSE2-NEXT: movw %cx, 12(%rdi)
+; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB1_16
-; SSE2-NEXT: # %bb.15: # %cond.store13
+; SSE2-NEXT: .LBB1_15: # %cond.store13
; SSE2-NEXT: pextrw $7, %xmm0, %eax
; SSE2-NEXT: movw %ax, 14(%rdi)
-; SSE2-NEXT: .LBB1_16: # %else14
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i64_v8i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: pxor %xmm8, %xmm8
-; SSE4-NEXT: movdqa %xmm4, %xmm7
-; SSE4-NEXT: pcmpeqd %xmm8, %xmm7
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm6
-; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2,3],xmm3[4],xmm8[5,6,7]
-; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2,3],xmm2[4],xmm8[5,6,7]
+; SSE4-NEXT: pxor %xmm6, %xmm6
+; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7]
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
; SSE4-NEXT: packusdw %xmm3, %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1,2,3],xmm1[4],xmm8[5,6,7]
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2,3],xmm0[4],xmm8[5,6,7]
+; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
+; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
; SSE4-NEXT: packusdw %xmm1, %xmm0
; SSE4-NEXT: packusdw %xmm2, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB1_2
-; SSE4-NEXT: # %bb.1: # %cond.store
+; SSE4-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT: pxor %xmm1, %xmm5
+; SSE4-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE4-NEXT: pxor %xmm1, %xmm4
+; SSE4-NEXT: packssdw %xmm5, %xmm4
+; SSE4-NEXT: packsswb %xmm0, %xmm4
+; SSE4-NEXT: pmovmskb %xmm4, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB1_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB1_3
+; SSE4-NEXT: .LBB1_4: # %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne .LBB1_5
+; SSE4-NEXT: .LBB1_6: # %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne .LBB1_7
+; SSE4-NEXT: .LBB1_8: # %else6
+; SSE4-NEXT: testb $16, %al
+; SSE4-NEXT: jne .LBB1_9
+; SSE4-NEXT: .LBB1_10: # %else8
+; SSE4-NEXT: testb $32, %al
+; SSE4-NEXT: jne .LBB1_11
+; SSE4-NEXT: .LBB1_12: # %else10
+; SSE4-NEXT: testb $64, %al
+; SSE4-NEXT: jne .LBB1_13
+; SSE4-NEXT: .LBB1_14: # %else12
+; SSE4-NEXT: testb $-128, %al
+; SSE4-NEXT: jne .LBB1_15
+; SSE4-NEXT: .LBB1_16: # %else14
+; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB1_1: # %cond.store
; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
-; SSE4-NEXT: .LBB1_2: # %else
-; SSE4-NEXT: pextrb $4, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB1_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
+; SSE4-NEXT: .LBB1_3: # %cond.store1
; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
-; SSE4-NEXT: .LBB1_4: # %else2
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm4
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE4-NEXT: pxor %xmm4, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB1_6
-; SSE4-NEXT: # %bb.5: # %cond.store3
+; SSE4-NEXT: .LBB1_5: # %cond.store3
; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
-; SSE4-NEXT: .LBB1_6: # %else4
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB1_8
-; SSE4-NEXT: # %bb.7: # %cond.store5
+; SSE4-NEXT: .LBB1_7: # %cond.store5
; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
-; SSE4-NEXT: .LBB1_8: # %else6
-; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE4-NEXT: pxor %xmm2, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB1_10
-; SSE4-NEXT: # %bb.9: # %cond.store7
+; SSE4-NEXT: .LBB1_9: # %cond.store7
; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi)
-; SSE4-NEXT: .LBB1_10: # %else8
-; SSE4-NEXT: pextrb $4, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB1_12
-; SSE4-NEXT: # %bb.11: # %cond.store9
+; SSE4-NEXT: .LBB1_11: # %cond.store9
; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi)
-; SSE4-NEXT: .LBB1_12: # %else10
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE4-NEXT: pxor %xmm5, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB1_14
-; SSE4-NEXT: # %bb.13: # %cond.store11
+; SSE4-NEXT: .LBB1_13: # %cond.store11
; SSE4-NEXT: pextrw $6, %xmm0, 12(%rdi)
-; SSE4-NEXT: .LBB1_14: # %else12
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB1_16
-; SSE4-NEXT: # %bb.15: # %cond.store13
+; SSE4-NEXT: .LBB1_15: # %cond.store13
; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi)
-; SSE4-NEXT: .LBB1_16: # %else14
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm6, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpackusdw %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm5, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB1_2
-; AVX1-NEXT: # %bb.1: # %cond.store
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovmskps %ymm1, %eax
+; AVX1-NEXT: notl %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB1_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB1_3
+; AVX1-NEXT: .LBB1_4: # %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: jne .LBB1_5
+; AVX1-NEXT: .LBB1_6: # %else4
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB1_7
+; AVX1-NEXT: .LBB1_8: # %else6
+; AVX1-NEXT: testb $16, %al
+; AVX1-NEXT: jne .LBB1_9
+; AVX1-NEXT: .LBB1_10: # %else8
+; AVX1-NEXT: testb $32, %al
+; AVX1-NEXT: jne .LBB1_11
+; AVX1-NEXT: .LBB1_12: # %else10
+; AVX1-NEXT: testb $64, %al
+; AVX1-NEXT: jne .LBB1_13
+; AVX1-NEXT: .LBB1_14: # %else12
+; AVX1-NEXT: testb $-128, %al
+; AVX1-NEXT: jne .LBB1_15
+; AVX1-NEXT: .LBB1_16: # %else14
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB1_1: # %cond.store
; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX1-NEXT: .LBB1_2: # %else
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB1_4
-; AVX1-NEXT: # %bb.3: # %cond.store1
+; AVX1-NEXT: .LBB1_3: # %cond.store1
; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
-; AVX1-NEXT: .LBB1_4: # %else2
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $8, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB1_6
-; AVX1-NEXT: # %bb.5: # %cond.store3
+; AVX1-NEXT: .LBB1_5: # %cond.store3
; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
-; AVX1-NEXT: .LBB1_6: # %else4
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB1_8
-; AVX1-NEXT: # %bb.7: # %cond.store5
+; AVX1-NEXT: .LBB1_7: # %cond.store5
; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi)
-; AVX1-NEXT: .LBB1_8: # %else6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $0, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB1_10
-; AVX1-NEXT: # %bb.9: # %cond.store7
+; AVX1-NEXT: .LBB1_9: # %cond.store7
; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi)
-; AVX1-NEXT: .LBB1_10: # %else8
-; AVX1-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB1_12
-; AVX1-NEXT: # %bb.11: # %cond.store9
+; AVX1-NEXT: .LBB1_11: # %cond.store9
; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi)
-; AVX1-NEXT: .LBB1_12: # %else10
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB1_14
-; AVX1-NEXT: # %bb.13: # %cond.store11
+; AVX1-NEXT: .LBB1_13: # %cond.store11
; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi)
-; AVX1-NEXT: .LBB1_14: # %else12
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB1_16
-; AVX1-NEXT: # %bb.15: # %cond.store13
+; AVX1-NEXT: .LBB1_15: # %cond.store13
; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi)
-; AVX1-NEXT: .LBB1_16: # %else14
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpextrb $0, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je .LBB1_2
-; AVX2-NEXT: # %bb.1: # %cond.store
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vmovmskps %ymm1, %eax
+; AVX2-NEXT: notl %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB1_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB1_3
+; AVX2-NEXT: .LBB1_4: # %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne .LBB1_5
+; AVX2-NEXT: .LBB1_6: # %else4
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB1_7
+; AVX2-NEXT: .LBB1_8: # %else6
+; AVX2-NEXT: testb $16, %al
+; AVX2-NEXT: jne .LBB1_9
+; AVX2-NEXT: .LBB1_10: # %else8
+; AVX2-NEXT: testb $32, %al
+; AVX2-NEXT: jne .LBB1_11
+; AVX2-NEXT: .LBB1_12: # %else10
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: jne .LBB1_13
+; AVX2-NEXT: .LBB1_14: # %else12
+; AVX2-NEXT: testb $-128, %al
+; AVX2-NEXT: jne .LBB1_15
+; AVX2-NEXT: .LBB1_16: # %else14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB1_1: # %cond.store
; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT: .LBB1_2: # %else
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB1_4
-; AVX2-NEXT: #
%bb.3: # %cond.store1 +; AVX2-NEXT: .LBB1_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB1_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB1_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB1_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB1_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB1_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB1_7: # %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB1_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB1_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB1_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB1_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB1_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB1_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB1_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB1_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB1_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB1_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB1_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB1_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB1_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -559,66 +533,61 @@ ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB1_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB1_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB1_3 +; AVX512F-NEXT: .LBB1_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB1_5 +; AVX512F-NEXT: .LBB1_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB1_7 +; AVX512F-NEXT: .LBB1_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB1_9 +; AVX512F-NEXT: .LBB1_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB1_11 +; AVX512F-NEXT: .LBB1_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB1_13 +; AVX512F-NEXT: .LBB1_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB1_15 +; AVX512F-NEXT: .LBB1_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB1_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB1_2: # %else -; AVX512F-NEXT: vptestmd 
%zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB1_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB1_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB1_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB1_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB1_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB1_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB1_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB1_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB1_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB1_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB1_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB1_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB1_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB1_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB1_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB1_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB1_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB1_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB1_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB1_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB1_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -645,331 +614,311 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: packssdw %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: 
pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: movd %xmm7, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB2_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB2_2: # %else -; SSE2-NEXT: psrlq $16, %xmm6 -; SSE2-NEXT: movd %xmm6, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB2_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB2_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm0, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) -; SSE2-NEXT: .LBB2_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB2_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB2_3 +; SSE2-NEXT: .LBB2_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB2_5 +; SSE2-NEXT: .LBB2_6: # %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB2_7 ; SSE2-NEXT: .LBB2_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB2_9 ; SSE2-NEXT: .LBB2_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB2_11 ; SSE2-NEXT: .LBB2_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB2_13 ; SSE2-NEXT: .LBB2_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB2_15 +; SSE2-NEXT: .LBB2_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB2_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB2_4 +; SSE2-NEXT: .LBB2_3: # 
%cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB2_6 +; SSE2-NEXT: .LBB2_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB2_8 +; SSE2-NEXT: .LBB2_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB2_10 +; SSE2-NEXT: .LBB2_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB2_12 +; SSE2-NEXT: .LBB2_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB2_14 +; SSE2-NEXT: .LBB2_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB2_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB2_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB2_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa %xmm4, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm6 -; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2,3],xmm3[4],xmm8[5,6,7] -; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2,3],xmm2[4],xmm8[5,6,7] +; SSE4-NEXT: pxor %xmm6, %xmm6 +; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] ; SSE4-NEXT: packusdw %xmm3, %xmm2 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1,2,3],xmm1[4],xmm8[5,6,7] -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2,3],xmm0[4],xmm8[5,6,7] +; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] ; SSE4-NEXT: packusdw %xmm1, %xmm0 ; SSE4-NEXT: packusdw %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm6, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB2_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE4-NEXT: pxor %xmm1, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm0, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB2_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB2_3 +; SSE4-NEXT: .LBB2_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB2_5 +; SSE4-NEXT: .LBB2_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB2_7 +; SSE4-NEXT: .LBB2_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB2_9 +; SSE4-NEXT: .LBB2_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB2_11 +; SSE4-NEXT: .LBB2_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB2_13 +; SSE4-NEXT: .LBB2_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB2_15 +; SSE4-NEXT: .LBB2_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB2_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB2_2: # %else -; SSE4-NEXT: pextrb $4, %xmm6, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB2_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB2_3: # %cond.store1 ; 
SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB2_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB2_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB2_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB2_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB2_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB2_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB2_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB2_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB2_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB2_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB2_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB2_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB2_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm5, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB2_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB2_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB2_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB2_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB2_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB2_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpackusdw %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm5, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB2_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB2_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne 
.LBB2_3 +; AVX1-NEXT: .LBB2_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB2_5 +; AVX1-NEXT: .LBB2_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB2_7 +; AVX1-NEXT: .LBB2_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB2_9 +; AVX1-NEXT: .LBB2_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB2_11 +; AVX1-NEXT: .LBB2_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB2_13 +; AVX1-NEXT: .LBB2_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB2_15 +; AVX1-NEXT: .LBB2_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB2_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB2_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB2_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB2_3: # %cond.store1 ; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB2_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB2_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB2_5: # %cond.store3 ; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB2_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB2_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB2_7: # %cond.store5 ; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB2_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB2_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB2_9: # %cond.store7 ; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB2_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB2_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB2_11: # %cond.store9 ; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB2_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB2_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB2_13: # %cond.store11 ; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB2_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB2_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB2_15: # %cond.store13 ; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB2_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i64_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; 
AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpextrb $0, %xmm5, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB2_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB2_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB2_3 +; AVX2-NEXT: .LBB2_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB2_5 +; AVX2-NEXT: .LBB2_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB2_7 +; AVX2-NEXT: .LBB2_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB2_9 +; AVX2-NEXT: .LBB2_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB2_11 +; AVX2-NEXT: .LBB2_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB2_13 +; AVX2-NEXT: .LBB2_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB2_15 +; AVX2-NEXT: .LBB2_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB2_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB2_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB2_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB2_3: # %cond.store1 ; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB2_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB2_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB2_5: # %cond.store3 ; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB2_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB2_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB2_7: # %cond.store5 ; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB2_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB2_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB2_9: # %cond.store7 ; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB2_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB2_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB2_11: # 
%cond.store9 ; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB2_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB2_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB2_13: # %cond.store11 ; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB2_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB2_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB2_15: # %cond.store13 ; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB2_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -980,66 +929,61 @@ ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB2_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB2_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB2_3 +; AVX512F-NEXT: .LBB2_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB2_5 +; AVX512F-NEXT: .LBB2_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB2_7 +; AVX512F-NEXT: .LBB2_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB2_9 +; AVX512F-NEXT: .LBB2_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB2_11 +; AVX512F-NEXT: .LBB2_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB2_13 +; AVX512F-NEXT: .LBB2_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB2_15 +; AVX512F-NEXT: .LBB2_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB2_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB2_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB2_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB2_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB2_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB2_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB2_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB2_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB2_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB2_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB2_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB2_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB2_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB2_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb 
$1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB2_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB2_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB2_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB2_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB2_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB2_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB2_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB2_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB2_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1067,80 +1011,76 @@ ; SSE2-LABEL: truncstore_v4i64_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB3_2 -; SSE2-NEXT: # %bb.1: # %cond.store +; SSE2-NEXT: jne .LBB3_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB3_3 +; SSE2-NEXT: .LBB3_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB3_5 +; SSE2-NEXT: .LBB3_6: # %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB3_7 +; SSE2-NEXT: .LBB3_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB3_1: # %cond.store ; SSE2-NEXT: movss %xmm0, (%rdi) -; SSE2-NEXT: .LBB3_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB3_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm3, 4(%rdi) -; SSE2-NEXT: .LBB3_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB3_3: # %cond.store1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm1, 4(%rdi) +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB3_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 +; SSE2-NEXT: .LBB3_5: # %cond.store3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: movd %xmm1, 8(%rdi) -; SSE2-NEXT: .LBB3_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB3_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB3_7: # %cond.store5 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) -; SSE2-NEXT: .LBB3_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i64_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE4-NEXT: pxor %xmm4, %xmm3 +; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE4-NEXT: pextrb $0, %xmm3, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB3_2 -; 
SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE4-NEXT: movmskps %xmm3, %eax +; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB3_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB3_3 +; SSE4-NEXT: .LBB3_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB3_5 +; SSE4-NEXT: .LBB3_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB3_7 +; SSE4-NEXT: .LBB3_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB3_1: # %cond.store ; SSE4-NEXT: movss %xmm0, (%rdi) -; SSE4-NEXT: .LBB3_2: # %else -; SSE4-NEXT: pextrb $4, %xmm3, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB3_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB3_3: # %cond.store1 ; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB3_4: # %else2 -; SSE4-NEXT: xorps %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB3_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB3_5: # %cond.store3 ; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB3_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB3_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB3_7: # %cond.store5 ; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB3_8: # %else6 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i64_v4i32: @@ -1207,119 +1147,115 @@ ; SSE2-LABEL: truncstore_v4i64_v4i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB4_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB4_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB4_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) +; SSE2-NEXT: jne .LBB4_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB4_3 ; SSE2-NEXT: .LBB4_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB4_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB4_5 ; SSE2-NEXT: .LBB4_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB4_7 +; SSE2-NEXT: .LBB4_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB4_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB4_4 +; SSE2-NEXT: .LBB4_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB4_6 +; SSE2-NEXT: .LBB4_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; 
SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB4_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB4_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) -; SSE2-NEXT: .LBB4_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i64_v4i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE4-NEXT: pxor %xmm4, %xmm3 +; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE4-NEXT: pextrb $0, %xmm3, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB4_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE4-NEXT: movmskps %xmm3, %eax +; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB4_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB4_3 +; SSE4-NEXT: .LBB4_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB4_5 +; SSE4-NEXT: .LBB4_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB4_7 +; SSE4-NEXT: .LBB4_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB4_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB4_2: # %else -; SSE4-NEXT: pextrb $4, %xmm3, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB4_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB4_3: # %cond.store1 ; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB4_4: # %else2 -; SSE4-NEXT: xorps %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB4_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB4_5: # %cond.store3 ; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB4_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB4_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB4_7: # %cond.store5 ; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB4_8: # %else6 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v4i64_v4i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB4_2 -; AVX-NEXT: # %bb.1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB4_2: # %else -; AVX-NEXT: vpextrb $4, %xmm2, %eax +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: xorl $15, %eax ; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB4_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB4_3 +; AVX-NEXT: .LBB4_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB4_5 +; AVX-NEXT: .LBB4_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB4_7 +; AVX-NEXT: .LBB4_8: # %else6 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; AVX-NEXT: .LBB4_1: # %cond.store +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB4_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB4_3: # %cond.store1 ; AVX-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB4_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, 
%xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $4, %al ; AVX-NEXT: je .LBB4_6 -; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: .LBB4_5: # %cond.store3 ; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX-NEXT: .LBB4_6: # %else4 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $8, %al ; AVX-NEXT: je .LBB4_8 -; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: .LBB4_7: # %cond.store5 ; AVX-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX-NEXT: .LBB4_8: # %else6 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1331,34 +1267,33 @@ ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB4_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB4_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB4_3 +; AVX512F-NEXT: .LBB4_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB4_5 +; AVX512F-NEXT: .LBB4_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB4_7 +; AVX512F-NEXT: .LBB4_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB4_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB4_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB4_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB4_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB4_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB4_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB4_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB4_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB4_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB4_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB4_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1391,119 +1326,115 @@ ; SSE2-LABEL: truncstore_v4i64_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB5_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB5_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB5_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: jne .LBB5_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB5_3 ; SSE2-NEXT: .LBB5_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; 
SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB5_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB5_5 ; SSE2-NEXT: .LBB5_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB5_7 +; SSE2-NEXT: .LBB5_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB5_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB5_4 +; SSE2-NEXT: .LBB5_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB5_6 +; SSE2-NEXT: .LBB5_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB5_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB5_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 3(%rdi) -; SSE2-NEXT: .LBB5_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE4-NEXT: pxor %xmm4, %xmm3 +; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE4-NEXT: pextrb $0, %xmm3, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB5_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE4-NEXT: movmskps %xmm3, %eax +; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB5_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB5_3 +; SSE4-NEXT: .LBB5_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB5_5 +; SSE4-NEXT: .LBB5_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB5_7 +; SSE4-NEXT: .LBB5_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB5_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB5_2: # %else -; SSE4-NEXT: pextrb $4, %xmm3, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB5_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB5_3: # %cond.store1 ; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB5_4: # %else2 -; SSE4-NEXT: xorps %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB5_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB5_5: # %cond.store3 ; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB5_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB5_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB5_7: # %cond.store5 ; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB5_8: # %else6 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v4i64_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: xorl $15, %eax 
; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB5_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB5_3 +; AVX-NEXT: .LBB5_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB5_5 +; AVX-NEXT: .LBB5_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB5_7 +; AVX-NEXT: .LBB5_8: # %else6 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; AVX-NEXT: .LBB5_1: # %cond.store ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB5_2: # %else -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB5_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB5_3: # %cond.store1 ; AVX-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB5_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $4, %al ; AVX-NEXT: je .LBB5_6 -; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: .LBB5_5: # %cond.store3 ; AVX-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB5_6: # %else4 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $8, %al ; AVX-NEXT: je .LBB5_8 -; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: .LBB5_7: # %cond.store5 ; AVX-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX-NEXT: .LBB5_8: # %else6 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1515,34 +1446,33 @@ ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB5_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB5_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB5_3 +; AVX512F-NEXT: .LBB5_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB5_5 +; AVX512F-NEXT: .LBB5_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB5_7 +; AVX512F-NEXT: .LBB5_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB5_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB5_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB5_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB5_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB5_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB5_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB5_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB5_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB5_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB5_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB5_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1578,42 +1508,43 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: 
notl %eax +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB6_2 -; SSE2-NEXT: # %bb.1: # %cond.store +; SSE2-NEXT: jne .LBB6_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB6_3 +; SSE2-NEXT: .LBB6_4: # %else2 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB6_1: # %cond.store ; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: .LBB6_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB6_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 +; SSE2-NEXT: .LBB6_3: # %cond.store1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movd %xmm0, 4(%rdi) -; SSE2-NEXT: .LBB6_4: # %else2 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax +; SSE4-NEXT: movmskpd %xmm2, %eax +; SSE4-NEXT: xorl $3, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB6_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB6_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB6_3 +; SSE4-NEXT: .LBB6_4: # %else2 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB6_1: # %cond.store ; SSE4-NEXT: movss %xmm0, (%rdi) -; SSE4-NEXT: .LBB6_2: # %else -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB6_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB6_3: # %cond.store1 ; SSE4-NEXT: extractps $2, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB6_4: # %else2 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v2i64_v2i32: @@ -1678,63 +1609,65 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB7_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB7_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB7_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB7_3 +; SSE2-NEXT: .LBB7_4: # %else2 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB7_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB7_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 +; SSE2-NEXT: .LBB7_3: # %cond.store1 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: .LBB7_4: # %else2 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax +; SSE4-NEXT: movmskpd %xmm2, %eax +; SSE4-NEXT: xorl $3, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB7_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB7_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB7_3 +; SSE4-NEXT: .LBB7_4: # %else2 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB7_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB7_2: # %else -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; 
SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB7_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB7_3: # %cond.store1 ; SSE4-NEXT: pextrw $4, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB7_4: # %else2 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v2i64_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vmovmskpd %xmm1, %eax +; AVX-NEXT: xorl $3, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB7_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB7_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB7_3 +; AVX-NEXT: .LBB7_4: # %else2 +; AVX-NEXT: retq +; AVX-NEXT: .LBB7_1: # %cond.store ; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB7_2: # %else -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB7_3: # %cond.store1 ; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB7_4: # %else2 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: @@ -1743,18 +1676,19 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB7_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB7_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB7_3 +; AVX512F-NEXT: .LBB7_4: # %else2 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB7_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB7_2: # %else -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB7_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB7_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB7_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1788,63 +1722,65 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB8_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB8_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB8_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB8_3 +; SSE2-NEXT: .LBB8_4: # %else2 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB8_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB8_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 +; SSE2-NEXT: .LBB8_3: # %cond.store1 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax +; SSE4-NEXT: movmskpd %xmm2, %eax +; SSE4-NEXT: xorl $3, %eax ; SSE4-NEXT: testb $1, %al -; 
SSE4-NEXT: je .LBB8_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB8_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB8_3 +; SSE4-NEXT: .LBB8_4: # %else2 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB8_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB8_2: # %else -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB8_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB8_3: # %cond.store1 ; SSE4-NEXT: pextrb $8, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB8_4: # %else2 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v2i64_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vmovmskpd %xmm1, %eax +; AVX-NEXT: xorl $3, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB8_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB8_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB8_3 +; AVX-NEXT: .LBB8_4: # %else2 +; AVX-NEXT: retq +; AVX-NEXT: .LBB8_1: # %cond.store ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB8_2: # %else -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB8_3: # %cond.store1 ; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB8_4: # %else2 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: @@ -1853,18 +1789,19 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB8_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB8_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB8_3 +; AVX512F-NEXT: .LBB8_4: # %else2 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB8_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB8_2: # %else -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB8_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB8_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB8_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1894,671 +1831,557 @@ ; SSE2-LABEL: truncstore_v16i32_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB9_2: # %else -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm9, %xmm8 -; SSE2-NEXT: pextrw $2, %xmm8, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: .LBB9_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm9 -; SSE2-NEXT: pextrw $4, %xmm9, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_6 -; 
SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) -; SSE2-NEXT: .LBB9_6: # %else4 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm6, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB9_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB9_3 +; SSE2-NEXT: .LBB9_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB9_5 +; SSE2-NEXT: .LBB9_6: # %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB9_7 ; SSE2-NEXT: .LBB9_8: # %else6 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB9_9 ; SSE2-NEXT: .LBB9_10: # %else8 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pextrw $2, %xmm4, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB9_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) +; SSE2-NEXT: .LBB9_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) ; SSE2-NEXT: .LBB9_12: # %else10 ; SSE2-NEXT: pslld $16, %xmm3 ; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB9_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) ; SSE2-NEXT: .LBB9_14: # %else12 ; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pextrw $6, %xmm5, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB9_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movw %ax, 14(%rdi) +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 14(%rdi) ; SSE2-NEXT: .LBB9_16: # %else14 ; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_18 -; SSE2-NEXT: # %bb.17: # %cond.store15 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: movw %ax, 16(%rdi) -; SSE2-NEXT: .LBB9_18: # %else16 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_20 -; SSE2-NEXT: # %bb.19: # %cond.store17 -; SSE2-NEXT: pextrw $1, %xmm2, %eax -; SSE2-NEXT: movw %ax, 18(%rdi) +; SSE2-NEXT: testl $256, %eax # imm = 0x100 +; SSE2-NEXT: 
jne .LBB9_17 +; SSE2-NEXT: # %bb.18: # %else16 +; SSE2-NEXT: testl $512, %eax # imm = 0x200 +; SSE2-NEXT: jne .LBB9_19 ; SSE2-NEXT: .LBB9_20: # %else18 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_22 -; SSE2-NEXT: # %bb.21: # %cond.store19 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: movw %ax, 20(%rdi) +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 +; SSE2-NEXT: jne .LBB9_21 ; SSE2-NEXT: .LBB9_22: # %else20 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: pextrw $6, %xmm6, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_24 -; SSE2-NEXT: # %bb.23: # %cond.store21 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: movw %ax, 22(%rdi) +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 +; SSE2-NEXT: jne .LBB9_23 ; SSE2-NEXT: .LBB9_24: # %else22 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_26 -; SSE2-NEXT: # %bb.25: # %cond.store23 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: movw %ax, 24(%rdi) +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE2-NEXT: jne .LBB9_25 ; SSE2-NEXT: .LBB9_26: # %else24 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_28 -; SSE2-NEXT: # %bb.27: # %cond.store25 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: movw %ax, 26(%rdi) +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE2-NEXT: jne .LBB9_27 ; SSE2-NEXT: .LBB9_28: # %else26 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_30 -; SSE2-NEXT: # %bb.29: # %cond.store27 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: movw %ax, 28(%rdi) +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE2-NEXT: jne .LBB9_29 ; SSE2-NEXT: .LBB9_30: # %else28 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pextrw $6, %xmm7, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB9_31 +; SSE2-NEXT: .LBB9_32: # %else30 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB9_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB9_4 +; SSE2-NEXT: .LBB9_3: # %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB9_6 +; SSE2-NEXT: .LBB9_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB9_8 +; SSE2-NEXT: .LBB9_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB9_10 +; SSE2-NEXT: .LBB9_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB9_11 +; SSE2-NEXT: jmp .LBB9_12 +; SSE2-NEXT: .LBB9_17: # %cond.store15 +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: movw %cx, 16(%rdi) +; SSE2-NEXT: testl $512, %eax # imm = 0x200 +; SSE2-NEXT: je .LBB9_20 +; SSE2-NEXT: .LBB9_19: # %cond.store17 +; SSE2-NEXT: pextrw $1, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 18(%rdi) +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 +; SSE2-NEXT: je .LBB9_22 +; 
SSE2-NEXT: .LBB9_21: # %cond.store19 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 20(%rdi) +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 +; SSE2-NEXT: je .LBB9_24 +; SSE2-NEXT: .LBB9_23: # %cond.store21 +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 22(%rdi) +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE2-NEXT: je .LBB9_26 +; SSE2-NEXT: .LBB9_25: # %cond.store23 +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 24(%rdi) +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE2-NEXT: je .LBB9_28 +; SSE2-NEXT: .LBB9_27: # %cond.store25 +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 26(%rdi) +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE2-NEXT: je .LBB9_30 +; SSE2-NEXT: .LBB9_29: # %cond.store27 +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 28(%rdi) +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB9_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 +; SSE2-NEXT: .LBB9_31: # %cond.store29 ; SSE2-NEXT: pextrw $7, %xmm2, %eax ; SSE2-NEXT: movw %ax, 30(%rdi) -; SSE2-NEXT: .LBB9_32: # %else30 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i32_v16i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa %xmm4, %xmm10 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE4-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE4-NEXT: pxor %xmm10, %xmm9 ; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7] ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7] ; SSE4-NEXT: packusdw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm9, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_2 -; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB9_2: # %else -; SSE4-NEXT: pextrb $4, %xmm9, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB9_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) +; SSE4-NEXT: pxor %xmm1, %xmm7 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE4-NEXT: pxor %xmm1, %xmm6 +; SSE4-NEXT: packssdw %xmm7, %xmm6 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pxor %xmm1, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm1, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm6, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB9_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB9_3 +; SSE4-NEXT: .LBB9_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB9_5 ; SSE4-NEXT: .LBB9_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB9_7 ; SSE4-NEXT: .LBB9_8: # %else6 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi) +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB9_9 
; SSE4-NEXT: .LBB9_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi) +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB9_11 ; SSE4-NEXT: .LBB9_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm5, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB9_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB9_13: # %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm0, 12(%rdi) ; SSE4-NEXT: .LBB9_14: # %else12 ; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2],xmm8[3],xmm3[4],xmm8[5],xmm3[6],xmm8[7] ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7] -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB9_16 ; SSE4-NEXT: # %bb.15: # %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi) ; SSE4-NEXT: .LBB9_16: # %else14 ; SSE4-NEXT: packusdw %xmm3, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm6, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB9_17 +; SSE4-NEXT: # %bb.18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB9_19 +; SSE4-NEXT: .LBB9_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB9_21 +; SSE4-NEXT: .LBB9_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB9_23 +; SSE4-NEXT: .LBB9_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB9_25 +; SSE4-NEXT: .LBB9_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB9_27 +; SSE4-NEXT: .LBB9_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB9_29 +; SSE4-NEXT: .LBB9_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB9_31 +; SSE4-NEXT: .LBB9_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB9_1: # %cond.store +; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB9_4 +; SSE4-NEXT: .LBB9_3: # %cond.store1 +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: je .LBB9_6 +; SSE4-NEXT: .LBB9_5: # %cond.store3 +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: je .LBB9_8 +; SSE4-NEXT: .LBB9_7: # %cond.store5 +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: je .LBB9_10 +; SSE4-NEXT: .LBB9_9: # %cond.store7 +; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi) +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: je .LBB9_12 +; SSE4-NEXT: .LBB9_11: # %cond.store9 +; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi) +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB9_13 +; SSE4-NEXT: jmp .LBB9_14 +; SSE4-NEXT: .LBB9_17: # %cond.store15 ; SSE4-NEXT: pextrw $0, %xmm2, 16(%rdi) -; SSE4-NEXT: .LBB9_18: # %else16 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB9_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB9_19: # %cond.store17 ; SSE4-NEXT: pextrw $1, %xmm2, 18(%rdi) -; SSE4-NEXT: .LBB9_20: # %else18 -; SSE4-NEXT: pxor 
%xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm6, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB9_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB9_21: # %cond.store19 ; SSE4-NEXT: pextrw $2, %xmm2, 20(%rdi) -; SSE4-NEXT: .LBB9_22: # %else20 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB9_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB9_23: # %cond.store21 ; SSE4-NEXT: pextrw $3, %xmm2, 22(%rdi) -; SSE4-NEXT: .LBB9_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB9_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB9_25: # %cond.store23 ; SSE4-NEXT: pextrw $4, %xmm2, 24(%rdi) -; SSE4-NEXT: .LBB9_26: # %else24 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB9_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB9_27: # %cond.store25 ; SSE4-NEXT: pextrw $5, %xmm2, 26(%rdi) -; SSE4-NEXT: .LBB9_28: # %else26 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB9_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB9_29: # %cond.store27 ; SSE4-NEXT: pextrw $6, %xmm2, 28(%rdi) -; SSE4-NEXT: .LBB9_30: # %else28 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB9_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB9_31: # %cond.store29 ; SSE4-NEXT: pextrw $7, %xmm2, 30(%rdi) -; SSE4-NEXT: .LBB9_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpextrb $0, %xmm6, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB9_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpxor %xmm5, 
%xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB9_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB9_3 +; AVX1-NEXT: .LBB9_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB9_5 +; AVX1-NEXT: .LBB9_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB9_7 +; AVX1-NEXT: .LBB9_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB9_9 +; AVX1-NEXT: .LBB9_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB9_11 +; AVX1-NEXT: .LBB9_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB9_13 +; AVX1-NEXT: .LBB9_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: je .LBB9_16 +; AVX1-NEXT: .LBB9_15: # %cond.store13 +; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX1-NEXT: .LBB9_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: jne .LBB9_17 +; AVX1-NEXT: # %bb.18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB9_19 +; AVX1-NEXT: .LBB9_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB9_21 +; AVX1-NEXT: .LBB9_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB9_23 +; AVX1-NEXT: .LBB9_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB9_25 +; AVX1-NEXT: .LBB9_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB9_27 +; AVX1-NEXT: .LBB9_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB9_29 +; AVX1-NEXT: .LBB9_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB9_31 +; AVX1-NEXT: .LBB9_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB9_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB9_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB9_3: # %cond.store1 ; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB9_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $2, %xmm5, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB9_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB9_5: # %cond.store3 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB9_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB9_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB9_7: # %cond.store5 ; AVX1-NEXT: vpextrw $3, %xmm0, 
6(%rdi) -; AVX1-NEXT: .LBB9_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB9_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB9_9: # %cond.store7 ; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB9_10: # %else8 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB9_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB9_11: # %cond.store9 ; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB9_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB9_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB9_13: # %cond.store11 ; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB9_14: # %else12 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB9_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: je .LBB9_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB9_15 +; AVX1-NEXT: jmp .LBB9_16 +; AVX1-NEXT: .LBB9_17: # %cond.store15 ; AVX1-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX1-NEXT: .LBB9_18: # %else16 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB9_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB9_19: # %cond.store17 ; AVX1-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX1-NEXT: .LBB9_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB9_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB9_21: # %cond.store19 ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX1-NEXT: .LBB9_22: # %else20 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, 
%xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB9_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB9_23: # %cond.store21 ; AVX1-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX1-NEXT: .LBB9_24: # %else22 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB9_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB9_25: # %cond.store23 ; AVX1-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX1-NEXT: .LBB9_26: # %else24 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB9_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB9_27: # %cond.store25 ; AVX1-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX1-NEXT: .LBB9_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB9_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB9_29: # %cond.store27 ; AVX1-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX1-NEXT: .LBB9_30: # %else28 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB9_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB9_31: # %cond.store29 ; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX1-NEXT: .LBB9_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i32_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpextrb $0, %xmm6, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB9_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, 
%ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB9_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB9_3 +; AVX2-NEXT: .LBB9_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB9_5 +; AVX2-NEXT: .LBB9_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB9_7 +; AVX2-NEXT: .LBB9_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB9_9 +; AVX2-NEXT: .LBB9_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB9_11 +; AVX2-NEXT: .LBB9_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB9_13 +; AVX2-NEXT: .LBB9_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: je .LBB9_16 +; AVX2-NEXT: .LBB9_15: # %cond.store13 +; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX2-NEXT: .LBB9_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: jne .LBB9_17 +; AVX2-NEXT: # %bb.18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB9_19 +; AVX2-NEXT: .LBB9_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB9_21 +; AVX2-NEXT: .LBB9_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB9_23 +; AVX2-NEXT: .LBB9_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB9_25 +; AVX2-NEXT: .LBB9_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB9_27 +; AVX2-NEXT: .LBB9_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB9_29 +; AVX2-NEXT: .LBB9_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB9_31 +; AVX2-NEXT: .LBB9_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB9_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB9_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB9_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB9_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB9_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB9_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB9_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB9_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB9_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB9_7: # %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB9_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: 
vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $4, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB9_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB9_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB9_10: # %else8 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB9_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB9_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB9_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB9_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB9_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB9_14: # %else12 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB9_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB9_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: je .LBB9_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB9_15 +; AVX2-NEXT: jmp .LBB9_16 +; AVX2-NEXT: .LBB9_17: # %cond.store15 ; AVX2-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX2-NEXT: .LBB9_18: # %else16 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB9_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB9_19: # %cond.store17 ; AVX2-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX2-NEXT: .LBB9_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $10, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB9_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB9_21: # %cond.store19 ; AVX2-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX2-NEXT: .LBB9_22: # %else20 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je 
.LBB9_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB9_23: # %cond.store21 ; AVX2-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX2-NEXT: .LBB9_24: # %else22 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB9_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB9_25: # %cond.store23 ; AVX2-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX2-NEXT: .LBB9_26: # %else24 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB9_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB9_27: # %cond.store25 ; AVX2-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX2-NEXT: .LBB9_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB9_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB9_29: # %cond.store27 ; AVX2-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX2-NEXT: .LBB9_30: # %else28 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB9_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB9_31: # %cond.store29 ; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX2-NEXT: .LBB9_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2568,116 +2391,117 @@ ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB9_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB9_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB9_3 +; AVX512F-NEXT: .LBB9_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB9_5 +; AVX512F-NEXT: .LBB9_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB9_7 +; AVX512F-NEXT: .LBB9_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB9_9 +; AVX512F-NEXT: .LBB9_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB9_11 +; AVX512F-NEXT: .LBB9_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB9_13 +; AVX512F-NEXT: .LBB9_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: je .LBB9_16 +; AVX512F-NEXT: .LBB9_15: # %cond.store13 +; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512F-NEXT: .LBB9_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: jne .LBB9_17 +; AVX512F-NEXT: # %bb.18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB9_19 +; AVX512F-NEXT: .LBB9_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB9_21 +; AVX512F-NEXT: .LBB9_22: # %else20 +; AVX512F-NEXT: testl 
$2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB9_23 +; AVX512F-NEXT: .LBB9_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB9_25 +; AVX512F-NEXT: .LBB9_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB9_27 +; AVX512F-NEXT: .LBB9_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB9_29 +; AVX512F-NEXT: .LBB9_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB9_31 +; AVX512F-NEXT: .LBB9_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB9_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB9_2: # %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB9_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB9_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB9_4: # %else2 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB9_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB9_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB9_6: # %else4 -; AVX512F-NEXT: kshiftrw $3, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB9_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB9_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB9_8: # %else6 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB9_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB9_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB9_10: # %else8 -; AVX512F-NEXT: kshiftrw $5, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB9_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB9_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB9_12: # %else10 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB9_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB9_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB9_14: # %else12 -; AVX512F-NEXT: kshiftrw $7, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB9_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB9_16: # %else14 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: je .LBB9_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB9_15 +; AVX512F-NEXT: jmp .LBB9_16 +; AVX512F-NEXT: .LBB9_17: # %cond.store15 ; AVX512F-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX512F-NEXT: .LBB9_18: # %else16 -; AVX512F-NEXT: kshiftrw $9, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB9_20 -; 
AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB9_19: # %cond.store17 ; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX512F-NEXT: .LBB9_20: # %else18 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB9_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB9_21: # %cond.store19 ; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX512F-NEXT: .LBB9_22: # %else20 -; AVX512F-NEXT: kshiftrw $11, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB9_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB9_23: # %cond.store21 ; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX512F-NEXT: .LBB9_24: # %else22 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB9_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB9_25: # %cond.store23 ; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX512F-NEXT: .LBB9_26: # %else24 -; AVX512F-NEXT: kshiftrw $13, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB9_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB9_27: # %cond.store25 ; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX512F-NEXT: .LBB9_28: # %else26 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB9_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB9_29: # %cond.store27 ; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX512F-NEXT: .LBB9_30: # %else28 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB9_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB9_31: # %cond.store29 ; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX512F-NEXT: .LBB9_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2704,7 +2528,6 @@ ; SSE2-LABEL: truncstore_v16i32_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm9, %xmm2 @@ -2713,158 +2536,124 @@ ; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB10_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB10_2: # %else +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: pextrw $2, %xmm8, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb 
%xmm6, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB10_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB10_3 ; SSE2-NEXT: .LBB10_4: # %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB10_5 ; SSE2-NEXT: .LBB10_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm4, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB10_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB10_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB10_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB10_10: # %else8 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB10_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: movb %ah, 5(%rdi) +; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB10_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB10_14: # %else12 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pextrw $6, %xmm5, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB10_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: movb %ah, 7(%rdi) +; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB10_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: testl $256, %eax # imm = 0x100 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 -; SSE2-NEXT: movb %al, 8(%rdi) +; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB10_18: # %else16 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB10_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 -; SSE2-NEXT: movb %ah, 9(%rdi) +; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB10_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: 
testl $1024, %eax # imm = 0x400 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 -; SSE2-NEXT: movb %al, 10(%rdi) +; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB10_22: # %else20 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: pextrw $6, %xmm6, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB10_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 -; SSE2-NEXT: movb %ah, 11(%rdi) +; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB10_24: # %else22 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 -; SSE2-NEXT: movb %al, 12(%rdi) +; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB10_26: # %else24 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB10_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 -; SSE2-NEXT: movb %ah, 13(%rdi) +; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB10_28: # %else26 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: je .LBB10_30 -; SSE2-NEXT: # %bb.29: # %cond.store27 -; SSE2-NEXT: movb %al, 14(%rdi) -; SSE2-NEXT: .LBB10_30: # %else28 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pextrw $6, %xmm7, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 -; SSE2-NEXT: movb %ah, 15(%rdi) +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: jne .LBB10_29 +; SSE2-NEXT: # %bb.30: # %else28 +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB10_31 ; SSE2-NEXT: .LBB10_32: # %else30 ; SSE2-NEXT: retq +; SSE2-NEXT: .LBB10_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB10_4 +; SSE2-NEXT: .LBB10_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB10_6 +; SSE2-NEXT: .LBB10_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB10_7 +; SSE2-NEXT: jmp .LBB10_8 +; SSE2-NEXT: .LBB10_29: # %cond.store27 +; SSE2-NEXT: movb %cl, 14(%rdi) +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: je .LBB10_32 +; SSE2-NEXT: .LBB10_31: # %cond.store29 +; SSE2-NEXT: movb %ch, 15(%rdi) +; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i32_v16i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm9, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE4-NEXT: pxor %xmm9, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE4-NEXT: pand %xmm9, %xmm3 ; SSE4-NEXT: pand %xmm9, %xmm2 @@ -2873,496 +2662,407 @@ ; SSE4-NEXT: pand %xmm9, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 ; SSE4-NEXT: packuswb %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm8, 
%xmm7 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm7 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE4-NEXT: pxor %xmm1, %xmm6 +; SSE4-NEXT: packssdw %xmm7, %xmm6 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pxor %xmm1, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm1, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm6, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB10_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB10_3 +; SSE4-NEXT: .LBB10_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB10_5 +; SSE4-NEXT: .LBB10_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB10_7 +; SSE4-NEXT: .LBB10_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB10_9 +; SSE4-NEXT: .LBB10_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB10_11 +; SSE4-NEXT: .LBB10_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB10_13 +; SSE4-NEXT: .LBB10_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB10_15 +; SSE4-NEXT: .LBB10_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB10_17 +; SSE4-NEXT: .LBB10_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB10_19 +; SSE4-NEXT: .LBB10_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB10_21 +; SSE4-NEXT: .LBB10_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB10_23 +; SSE4-NEXT: .LBB10_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB10_25 +; SSE4-NEXT: .LBB10_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB10_27 +; SSE4-NEXT: .LBB10_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB10_29 +; SSE4-NEXT: .LBB10_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB10_31 +; SSE4-NEXT: .LBB10_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB10_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB10_2: # %else -; SSE4-NEXT: pextrb $4, %xmm8, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB10_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB10_3: # %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB10_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB10_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB10_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB10_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB10_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB10_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB10_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB10_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB10_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB10_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; 
SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB10_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB10_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB10_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm5, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB10_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB10_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB10_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB10_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB10_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB10_16: # %else14 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB10_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB10_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB10_18: # %else16 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB10_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB10_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB10_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm6 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm6, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB10_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB10_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB10_22: # %else20 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB10_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB10_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB10_24: # %else22 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB10_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB10_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB10_26: # %else24 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB10_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB10_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB10_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB10_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB10_29: # %cond.store27 ; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB10_30: # %else28 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; 
SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB10_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB10_31: # %cond.store29 ; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB10_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm6, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB10_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB10_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB10_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB10_3 +; AVX1-NEXT: .LBB10_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB10_5 +; AVX1-NEXT: .LBB10_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB10_7 +; AVX1-NEXT: .LBB10_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB10_9 +; AVX1-NEXT: .LBB10_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB10_11 +; AVX1-NEXT: .LBB10_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB10_13 +; AVX1-NEXT: .LBB10_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB10_15 +; AVX1-NEXT: .LBB10_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB10_17 +; AVX1-NEXT: .LBB10_18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB10_19 +; AVX1-NEXT: .LBB10_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB10_21 +; AVX1-NEXT: .LBB10_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB10_23 +; AVX1-NEXT: .LBB10_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB10_25 +; AVX1-NEXT: .LBB10_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; 
AVX1-NEXT: jne .LBB10_27 +; AVX1-NEXT: .LBB10_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB10_29 +; AVX1-NEXT: .LBB10_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB10_31 +; AVX1-NEXT: .LBB10_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB10_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB10_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB10_3: # %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB10_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $2, %xmm5, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB10_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB10_5: # %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB10_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB10_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB10_7: # %cond.store5 ; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB10_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB10_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB10_9: # %cond.store7 ; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB10_10: # %else8 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB10_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB10_11: # %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB10_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB10_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB10_13: # %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB10_14: # %else12 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB10_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB10_15: # %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB10_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, 
%xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB10_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: .LBB10_17: # %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB10_18: # %else16 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB10_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB10_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB10_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB10_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB10_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB10_22: # %else20 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB10_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB10_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB10_24: # %else22 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB10_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB10_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB10_26: # %else24 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB10_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB10_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB10_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB10_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB10_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB10_30: # %else28 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: 
testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB10_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB10_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB10_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i32_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX2-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm6 -; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm6, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB10_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB10_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB10_3 +; AVX2-NEXT: .LBB10_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB10_5 +; AVX2-NEXT: .LBB10_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB10_7 +; AVX2-NEXT: .LBB10_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB10_9 +; AVX2-NEXT: .LBB10_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB10_11 +; AVX2-NEXT: .LBB10_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB10_13 +; AVX2-NEXT: .LBB10_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB10_15 +; AVX2-NEXT: .LBB10_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB10_17 +; AVX2-NEXT: .LBB10_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB10_19 +; AVX2-NEXT: .LBB10_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB10_21 +; AVX2-NEXT: .LBB10_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB10_23 +; AVX2-NEXT: .LBB10_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB10_25 +; AVX2-NEXT: .LBB10_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB10_27 +; AVX2-NEXT: .LBB10_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB10_29 +; AVX2-NEXT: .LBB10_30: # %else28 +; 
AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB10_31 +; AVX2-NEXT: .LBB10_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB10_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB10_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB10_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB10_3: # %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB10_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB10_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB10_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB10_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB10_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB10_7: # %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB10_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $4, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB10_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB10_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB10_10: # %else8 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB10_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB10_11: # %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB10_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB10_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB10_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB10_14: # %else12 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB10_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB10_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB10_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: 
vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB10_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB10_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB10_18: # %else16 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB10_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB10_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX2-NEXT: .LBB10_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $10, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB10_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB10_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB10_22: # %else20 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB10_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB10_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB10_24: # %else22 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB10_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB10_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB10_26: # %else24 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB10_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB10_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB10_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB10_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 -; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB10_30: # %else28 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, 
%xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: .LBB10_29: # %cond.store27 +; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB10_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB10_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB10_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3372,115 +3072,117 @@ ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB10_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB10_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB10_3 +; AVX512F-NEXT: .LBB10_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB10_5 +; AVX512F-NEXT: .LBB10_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB10_7 +; AVX512F-NEXT: .LBB10_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB10_9 +; AVX512F-NEXT: .LBB10_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB10_11 +; AVX512F-NEXT: .LBB10_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB10_13 +; AVX512F-NEXT: .LBB10_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB10_15 +; AVX512F-NEXT: .LBB10_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB10_17 +; AVX512F-NEXT: .LBB10_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB10_19 +; AVX512F-NEXT: .LBB10_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB10_21 +; AVX512F-NEXT: .LBB10_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB10_23 +; AVX512F-NEXT: .LBB10_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB10_25 +; AVX512F-NEXT: .LBB10_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB10_27 +; AVX512F-NEXT: .LBB10_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB10_29 +; AVX512F-NEXT: .LBB10_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB10_31 +; AVX512F-NEXT: .LBB10_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB10_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB10_2: # %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB10_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB10_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB10_4: # %else2 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB10_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB10_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB10_6: # %else4 -; AVX512F-NEXT: kshiftrw $3, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB10_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB10_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB10_8: # %else6 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; 
AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB10_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB10_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB10_10: # %else8 -; AVX512F-NEXT: kshiftrw $5, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB10_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB10_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB10_12: # %else10 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB10_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB10_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB10_14: # %else12 -; AVX512F-NEXT: kshiftrw $7, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB10_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB10_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB10_16: # %else14 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB10_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB10_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB10_18: # %else16 -; AVX512F-NEXT: kshiftrw $9, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB10_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB10_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB10_20: # %else18 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB10_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB10_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB10_22: # %else20 -; AVX512F-NEXT: kshiftrw $11, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB10_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB10_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB10_24: # %else22 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB10_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB10_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB10_26: # %else24 -; AVX512F-NEXT: kshiftrw $13, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB10_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB10_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB10_28: # %else26 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB10_30 -; AVX512F-NEXT: 
# %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB10_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB10_30: # %else28 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB10_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB10_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB10_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3506,313 +3208,294 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: packssdw %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB11_2: # %else -; SSE2-NEXT: psrlq $16, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: .LBB11_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm0, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: jne .LBB11_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB11_3 +; SSE2-NEXT: .LBB11_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB11_5 ; SSE2-NEXT: .LBB11_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB11_7 ; SSE2-NEXT: .LBB11_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB11_9 ; SSE2-NEXT: .LBB11_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB11_11 ; SSE2-NEXT: .LBB11_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; 
SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB11_13 ; SSE2-NEXT: .LBB11_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB11_15 +; SSE2-NEXT: .LBB11_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB11_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB11_4 +; SSE2-NEXT: .LBB11_3: # %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB11_6 +; SSE2-NEXT: .LBB11_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB11_8 +; SSE2-NEXT: .LBB11_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB11_10 +; SSE2-NEXT: .LBB11_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB11_12 +; SSE2-NEXT: .LBB11_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB11_14 +; SSE2-NEXT: .LBB11_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB11_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB11_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) -; SSE2-NEXT: .LBB11_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i32_v8i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm5, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE4-NEXT: pshufb %xmm5, %xmm1 ; SSE4-NEXT: pshufb %xmm5, %xmm0 ; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB11_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE4-NEXT: pxor %xmm1, %xmm2 +; SSE4-NEXT: packssdw %xmm3, %xmm2 +; SSE4-NEXT: packsswb %xmm0, %xmm2 +; SSE4-NEXT: pmovmskb %xmm2, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB11_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB11_3 +; SSE4-NEXT: .LBB11_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB11_5 +; SSE4-NEXT: .LBB11_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB11_7 +; SSE4-NEXT: .LBB11_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB11_9 +; SSE4-NEXT: .LBB11_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB11_11 +; SSE4-NEXT: .LBB11_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB11_13 +; SSE4-NEXT: .LBB11_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB11_15 +; SSE4-NEXT: .LBB11_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB11_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB11_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; 
SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB11_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB11_3: # %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB11_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB11_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB11_5: # %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB11_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB11_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB11_7: # %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB11_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB11_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB11_9: # %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB11_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB11_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB11_11: # %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB11_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm3, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB11_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB11_13: # %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB11_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB11_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB11_15: # %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB11_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i32_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; AVX1-NEXT: vpextrb $0, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB11_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB11_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne 
.LBB11_3 +; AVX1-NEXT: .LBB11_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB11_5 +; AVX1-NEXT: .LBB11_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB11_7 +; AVX1-NEXT: .LBB11_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB11_9 +; AVX1-NEXT: .LBB11_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB11_11 +; AVX1-NEXT: .LBB11_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB11_13 +; AVX1-NEXT: .LBB11_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB11_15 +; AVX1-NEXT: .LBB11_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB11_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB11_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB11_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB11_3: # %cond.store1 ; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB11_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB11_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB11_5: # %cond.store3 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB11_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB11_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB11_7: # %cond.store5 ; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB11_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB11_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB11_9: # %cond.store7 ; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB11_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB11_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB11_11: # %cond.store9 ; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB11_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB11_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB11_13: # %cond.store11 ; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB11_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB11_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB11_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB11_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i32_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; 
AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpextrb $0, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB11_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB11_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB11_3 +; AVX2-NEXT: .LBB11_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB11_5 +; AVX2-NEXT: .LBB11_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB11_7 +; AVX2-NEXT: .LBB11_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB11_9 +; AVX2-NEXT: .LBB11_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB11_11 +; AVX2-NEXT: .LBB11_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB11_13 +; AVX2-NEXT: .LBB11_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB11_15 +; AVX2-NEXT: .LBB11_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB11_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB11_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB11_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB11_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB11_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB11_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB11_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB11_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB11_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB11_7: # %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB11_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB11_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB11_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB11_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB11_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB11_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB11_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB11_14 -; 
AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB11_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB11_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB11_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB11_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB11_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3824,66 +3507,61 @@ ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB11_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB11_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB11_3 +; AVX512F-NEXT: .LBB11_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB11_5 +; AVX512F-NEXT: .LBB11_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB11_7 +; AVX512F-NEXT: .LBB11_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB11_9 +; AVX512F-NEXT: .LBB11_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB11_11 +; AVX512F-NEXT: .LBB11_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB11_13 +; AVX512F-NEXT: .LBB11_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB11_15 +; AVX512F-NEXT: .LBB11_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB11_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB11_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB11_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB11_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB11_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB11_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB11_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB11_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB11_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB11_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB11_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB11_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB11_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB11_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB11_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB11_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB11_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw 
%k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB11_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB11_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB11_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB11_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB11_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB11_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3914,313 +3592,294 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: packssdw %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB12_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB12_2: # %else -; SSE2-NEXT: psrlq $16, %xmm4 -; SSE2-NEXT: movd %xmm4, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB12_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB12_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm0, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB12_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB12_3 +; SSE2-NEXT: .LBB12_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB12_5 +; SSE2-NEXT: .LBB12_6: # %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB12_7 +; SSE2-NEXT: .LBB12_8: # %else6 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB12_9 +; SSE2-NEXT: .LBB12_10: # %else8 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB12_11 +; SSE2-NEXT: .LBB12_12: # %else10 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB12_13 +; SSE2-NEXT: .LBB12_14: # %else12 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB12_15 +; SSE2-NEXT: .LBB12_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB12_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB12_4 +; SSE2-NEXT: .LBB12_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB12_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) -; SSE2-NEXT: .LBB12_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB12_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: 
je .LBB12_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movb %al, 3(%rdi) -; SSE2-NEXT: .LBB12_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB12_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB12_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 4(%rdi) -; SSE2-NEXT: .LBB12_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB12_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB12_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) -; SSE2-NEXT: .LBB12_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB12_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB12_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movb %al, 6(%rdi) -; SSE2-NEXT: .LBB12_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB12_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB12_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB12_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB12_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i32_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm5, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE4-NEXT: pshufb %xmm5, %xmm1 ; SSE4-NEXT: pshufb %xmm5, %xmm0 ; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB12_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE4-NEXT: pxor %xmm1, %xmm2 +; SSE4-NEXT: packssdw %xmm3, %xmm2 +; SSE4-NEXT: packsswb %xmm0, %xmm2 +; SSE4-NEXT: pmovmskb %xmm2, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB12_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB12_3 +; SSE4-NEXT: .LBB12_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB12_5 +; SSE4-NEXT: .LBB12_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB12_7 +; SSE4-NEXT: .LBB12_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB12_9 +; SSE4-NEXT: .LBB12_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB12_11 +; SSE4-NEXT: .LBB12_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB12_13 +; SSE4-NEXT: .LBB12_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB12_15 +; SSE4-NEXT: .LBB12_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB12_1: # 
%cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB12_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB12_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB12_3: # %cond.store1 ; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB12_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB12_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB12_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB12_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB12_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB12_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB12_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB12_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB12_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB12_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB12_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB12_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB12_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm3, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB12_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB12_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB12_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB12_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB12_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB12_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; AVX1-NEXT: vpextrb $0, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB12_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; 
AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB12_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB12_3 +; AVX1-NEXT: .LBB12_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB12_5 +; AVX1-NEXT: .LBB12_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB12_7 +; AVX1-NEXT: .LBB12_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB12_9 +; AVX1-NEXT: .LBB12_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB12_11 +; AVX1-NEXT: .LBB12_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB12_13 +; AVX1-NEXT: .LBB12_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB12_15 +; AVX1-NEXT: .LBB12_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB12_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB12_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB12_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB12_3: # %cond.store1 ; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB12_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB12_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB12_5: # %cond.store3 ; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB12_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB12_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB12_7: # %cond.store5 ; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB12_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB12_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB12_9: # %cond.store7 ; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB12_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB12_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB12_11: # %cond.store9 ; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB12_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB12_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB12_13: # %cond.store11 ; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB12_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB12_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB12_15: # %cond.store13 ; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB12_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: 
retq ; ; AVX2-LABEL: truncstore_v8i32_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpextrb $0, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB12_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB12_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB12_3 +; AVX2-NEXT: .LBB12_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB12_5 +; AVX2-NEXT: .LBB12_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB12_7 +; AVX2-NEXT: .LBB12_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB12_9 +; AVX2-NEXT: .LBB12_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB12_11 +; AVX2-NEXT: .LBB12_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB12_13 +; AVX2-NEXT: .LBB12_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB12_15 +; AVX2-NEXT: .LBB12_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB12_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB12_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB12_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB12_3: # %cond.store1 ; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB12_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB12_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB12_5: # %cond.store3 ; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB12_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB12_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB12_7: # %cond.store5 ; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB12_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB12_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB12_9: # %cond.store7 ; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB12_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB12_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB12_11: # %cond.store9 ; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB12_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor 
%xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB12_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB12_13: # %cond.store11 ; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB12_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB12_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB12_15: # %cond.store13 ; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB12_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4232,66 +3891,61 @@ ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB12_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB12_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB12_3 +; AVX512F-NEXT: .LBB12_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB12_5 +; AVX512F-NEXT: .LBB12_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB12_7 +; AVX512F-NEXT: .LBB12_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB12_9 +; AVX512F-NEXT: .LBB12_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB12_11 +; AVX512F-NEXT: .LBB12_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB12_13 +; AVX512F-NEXT: .LBB12_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB12_15 +; AVX512F-NEXT: .LBB12_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB12_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB12_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB12_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB12_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB12_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB12_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB12_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB12_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB12_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB12_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB12_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB12_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB12_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB12_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB12_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB12_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) 
-; AVX512F-NEXT: .LBB12_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB12_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB12_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB12_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB12_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB12_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB12_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4325,114 +3979,109 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB13_2: # %else -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) +; SSE2-NEXT: jne .LBB13_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB13_3 ; SSE2-NEXT: .LBB13_4: # %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB13_5 ; SSE2-NEXT: .LBB13_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB13_7 +; SSE2-NEXT: .LBB13_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB13_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB13_4 +; SSE2-NEXT: .LBB13_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB13_6 +; SSE2-NEXT: .LBB13_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB13_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB13_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) -; SSE2-NEXT: .LBB13_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i32_v4i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: movmskps %xmm2, %eax +; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB13_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB13_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB13_3 +; SSE4-NEXT: .LBB13_4: # %else2 
+; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB13_5 +; SSE4-NEXT: .LBB13_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB13_7 +; SSE4-NEXT: .LBB13_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB13_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB13_2: # %else -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB13_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB13_3: # %cond.store1 ; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB13_4: # %else2 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB13_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB13_5: # %cond.store3 ; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB13_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB13_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB13_7: # %cond.store5 ; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB13_8: # %else6 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v4i32_v4i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: xorl $15, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB13_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB13_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB13_3 +; AVX-NEXT: .LBB13_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB13_5 +; AVX-NEXT: .LBB13_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB13_7 +; AVX-NEXT: .LBB13_8: # %else6 +; AVX-NEXT: retq +; AVX-NEXT: .LBB13_1: # %cond.store ; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB13_2: # %else -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB13_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB13_3: # %cond.store1 ; AVX-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB13_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $4, %al ; AVX-NEXT: je .LBB13_6 -; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: .LBB13_5: # %cond.store3 ; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX-NEXT: .LBB13_6: # %else4 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $8, %al ; AVX-NEXT: je .LBB13_8 -; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: .LBB13_7: # %cond.store5 ; AVX-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX-NEXT: .LBB13_8: # %else6 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i16: @@ -4441,34 +4090,33 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB13_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB13_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB13_3 +; AVX512F-NEXT: .LBB13_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne 
.LBB13_5 +; AVX512F-NEXT: .LBB13_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB13_7 +; AVX512F-NEXT: .LBB13_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB13_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB13_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB13_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB13_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB13_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB13_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB13_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB13_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB13_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB13_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB13_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4499,114 +4147,109 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB14_2: # %else -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: jne .LBB14_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB14_3 ; SSE2-NEXT: .LBB14_4: # %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB14_5 ; SSE2-NEXT: .LBB14_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB14_7 +; SSE2-NEXT: .LBB14_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB14_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB14_4 +; SSE2-NEXT: .LBB14_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB14_6 +; SSE2-NEXT: .LBB14_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB14_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB14_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 3(%rdi) -; SSE2-NEXT: .LBB14_8: # 
%else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i32_v4i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: movmskps %xmm2, %eax +; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB14_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB14_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB14_3 +; SSE4-NEXT: .LBB14_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB14_5 +; SSE4-NEXT: .LBB14_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB14_7 +; SSE4-NEXT: .LBB14_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB14_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB14_2: # %else -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB14_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB14_3: # %cond.store1 ; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB14_4: # %else2 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB14_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB14_5: # %cond.store3 ; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB14_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB14_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB14_7: # %cond.store5 ; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB14_8: # %else6 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v4i32_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: xorl $15, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB14_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB14_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB14_3 +; AVX-NEXT: .LBB14_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB14_5 +; AVX-NEXT: .LBB14_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB14_7 +; AVX-NEXT: .LBB14_8: # %else6 +; AVX-NEXT: retq +; AVX-NEXT: .LBB14_1: # %cond.store ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB14_2: # %else -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB14_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB14_3: # %cond.store1 ; AVX-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB14_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $4, %al ; AVX-NEXT: je .LBB14_6 -; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: .LBB14_5: # %cond.store3 ; AVX-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB14_6: # %else4 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $8, %al ; AVX-NEXT: je 
.LBB14_8 -; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: .LBB14_7: # %cond.store5 ; AVX-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX-NEXT: .LBB14_8: # %else6 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i8: @@ -4615,34 +4258,33 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB14_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB14_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB14_3 +; AVX512F-NEXT: .LBB14_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB14_5 +; AVX512F-NEXT: .LBB14_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB14_7 +; AVX512F-NEXT: .LBB14_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB14_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB14_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB14_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB14_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB14_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB14_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB14_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB14_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB14_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB14_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB14_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4672,1515 +4314,1182 @@ ; SSE2-LABEL: truncstore_v32i16_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm7, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al +; SSE2-NEXT: pcmpeqb %xmm7, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %ecx +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm7, %xmm5 +; SSE2-NEXT: pmovmskb %xmm5, %eax +; SSE2-NEXT: notl %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: orl %ecx, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB15_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB15_2: # %else -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB15_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB15_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB15_3 ; SSE2-NEXT: .LBB15_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB15_6 -; 
SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB15_5 ; SSE2-NEXT: .LBB15_6: # %else4 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB15_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB15_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB15_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB15_10: # %else8 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB15_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB15_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB15_14: # %else12 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB15_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB15_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB15_18: # %else16 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB15_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB15_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB15_22: # %else20 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB15_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB15_24: # %else22 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB15_26: # %else24 -; SSE2-NEXT: shrl $8, %eax -; 
SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB15_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB15_28: # %else26 ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE2-NEXT: pextrw $7, %xmm4, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_30 ; SSE2-NEXT: # %bb.29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) ; SSE2-NEXT: .LBB15_30: # %else28 ; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB15_32 ; SSE2-NEXT: # %bb.31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) ; SSE2-NEXT: .LBB15_32: # %else30 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: je .LBB15_34 -; SSE2-NEXT: # %bb.33: # %cond.store31 -; SSE2-NEXT: movb %al, 16(%rdi) -; SSE2-NEXT: .LBB15_34: # %else32 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB15_36 -; SSE2-NEXT: # %bb.35: # %cond.store33 -; SSE2-NEXT: movb %ah, 17(%rdi) +; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: jne .LBB15_33 +; SSE2-NEXT: # %bb.34: # %else32 +; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE2-NEXT: jne .LBB15_35 ; SSE2-NEXT: .LBB15_36: # %else34 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB15_38 -; SSE2-NEXT: # %bb.37: # %cond.store35 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 18(%rdi) +; SSE2-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE2-NEXT: jne .LBB15_37 ; SSE2-NEXT: .LBB15_38: # %else36 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $524288, %eax # imm = 0x80000 ; SSE2-NEXT: je .LBB15_40 -; SSE2-NEXT: # %bb.39: # %cond.store37 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 19(%rdi) +; SSE2-NEXT: .LBB15_39: # %cond.store37 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 19(%rdi) ; SSE2-NEXT: .LBB15_40: # %else38 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: pextrw $2, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_42 ; SSE2-NEXT: # %bb.41: # %cond.store39 ; SSE2-NEXT: movb %cl, 20(%rdi) ; SSE2-NEXT: .LBB15_42: # %else40 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2097152, %eax # imm = 0x200000 ; SSE2-NEXT: je .LBB15_44 ; SSE2-NEXT: # %bb.43: # %cond.store41 ; SSE2-NEXT: movb %ch, 21(%rdi) ; SSE2-NEXT: .LBB15_44: # %else42 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: 
pextrw $3, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_46 ; SSE2-NEXT: # %bb.45: # %cond.store43 ; SSE2-NEXT: movb %cl, 22(%rdi) ; SSE2-NEXT: .LBB15_46: # %else44 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8388608, %eax # imm = 0x800000 ; SSE2-NEXT: je .LBB15_48 ; SSE2-NEXT: # %bb.47: # %cond.store45 ; SSE2-NEXT: movb %ch, 23(%rdi) ; SSE2-NEXT: .LBB15_48: # %else46 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: pextrw $4, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_50 ; SSE2-NEXT: # %bb.49: # %cond.store47 ; SSE2-NEXT: movb %cl, 24(%rdi) ; SSE2-NEXT: .LBB15_50: # %else48 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $33554432, %eax # imm = 0x2000000 ; SSE2-NEXT: je .LBB15_52 ; SSE2-NEXT: # %bb.51: # %cond.store49 ; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: .LBB15_52: # %else50 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: pextrw $5, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_54 ; SSE2-NEXT: # %bb.53: # %cond.store51 ; SSE2-NEXT: movb %cl, 26(%rdi) ; SSE2-NEXT: .LBB15_54: # %else52 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $134217728, %eax # imm = 0x8000000 ; SSE2-NEXT: je .LBB15_56 ; SSE2-NEXT: # %bb.55: # %cond.store53 ; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: .LBB15_56: # %else54 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: pextrw $6, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_58 ; SSE2-NEXT: # %bb.57: # %cond.store55 ; SSE2-NEXT: movb %cl, 28(%rdi) ; SSE2-NEXT: .LBB15_58: # %else56 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $536870912, %eax # imm = 0x20000000 ; SSE2-NEXT: je .LBB15_60 ; SSE2-NEXT: # %bb.59: # %cond.store57 ; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: .LBB15_60: # %else58 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm5 -; SSE2-NEXT: pextrw $7, %xmm5, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: pextrw $7, %xmm2, %ecx -; SSE2-NEXT: je .LBB15_62 -; SSE2-NEXT: # %bb.61: # %cond.store59 +; SSE2-NEXT: jne .LBB15_61 +; SSE2-NEXT: # %bb.62: # %else60 +; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; SSE2-NEXT: jne .LBB15_63 +; SSE2-NEXT: .LBB15_64: # %else62 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB15_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB15_4 +; SSE2-NEXT: .LBB15_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB15_6 +; SSE2-NEXT: .LBB15_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB15_7 +; SSE2-NEXT: jmp .LBB15_8 +; SSE2-NEXT: .LBB15_33: # %cond.store31 +; SSE2-NEXT: movb %cl, 16(%rdi) +; SSE2-NEXT: testl $131072, %eax # imm = 
0x20000 +; SSE2-NEXT: je .LBB15_36 +; SSE2-NEXT: .LBB15_35: # %cond.store33 +; SSE2-NEXT: movb %ch, 17(%rdi) +; SSE2-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE2-NEXT: je .LBB15_38 +; SSE2-NEXT: .LBB15_37: # %cond.store35 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 18(%rdi) +; SSE2-NEXT: testl $524288, %eax # imm = 0x80000 +; SSE2-NEXT: jne .LBB15_39 +; SSE2-NEXT: jmp .LBB15_40 +; SSE2-NEXT: .LBB15_61: # %cond.store59 ; SSE2-NEXT: movb %cl, 30(%rdi) -; SSE2-NEXT: .LBB15_62: # %else60 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; SSE2-NEXT: je .LBB15_64 -; SSE2-NEXT: # %bb.63: # %cond.store61 +; SSE2-NEXT: .LBB15_63: # %cond.store61 ; SSE2-NEXT: movb %ch, 31(%rdi) -; SSE2-NEXT: .LBB15_64: # %else62 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v32i16_v32i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm7 ; SSE4-NEXT: movdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; SSE4-NEXT: pshufb %xmm6, %xmm1 ; SSE4-NEXT: pshufb %xmm6, %xmm0 ; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE4-NEXT: pextrb $0, %xmm7, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqb %xmm7, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %ecx +; SSE4-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; SSE4-NEXT: pcmpeqb %xmm7, %xmm5 +; SSE4-NEXT: pmovmskb %xmm5, %eax +; SSE4-NEXT: notl %eax +; SSE4-NEXT: shll $16, %eax +; SSE4-NEXT: orl %ecx, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB15_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB15_3 +; SSE4-NEXT: .LBB15_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB15_5 +; SSE4-NEXT: .LBB15_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB15_7 +; SSE4-NEXT: .LBB15_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB15_9 +; SSE4-NEXT: .LBB15_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB15_11 +; SSE4-NEXT: .LBB15_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB15_13 +; SSE4-NEXT: .LBB15_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB15_15 +; SSE4-NEXT: .LBB15_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB15_17 +; SSE4-NEXT: .LBB15_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB15_19 +; SSE4-NEXT: .LBB15_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB15_21 +; SSE4-NEXT: .LBB15_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB15_23 +; SSE4-NEXT: .LBB15_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB15_25 +; SSE4-NEXT: .LBB15_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB15_27 +; SSE4-NEXT: .LBB15_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: je .LBB15_30 +; SSE4-NEXT: .LBB15_29: # %cond.store27 +; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) +; SSE4-NEXT: .LBB15_30: # %else28 +; SSE4-NEXT: pshufb %xmm6, %xmm3 +; SSE4-NEXT: pshufb %xmm6, %xmm2 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: je .LBB15_32 +; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) +; SSE4-NEXT: .LBB15_32: # %else30 +; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE4-NEXT: testl $65536, %eax # imm = 0x10000 +; SSE4-NEXT: jne 
.LBB15_33 +; SSE4-NEXT: # %bb.34: # %else32 +; SSE4-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE4-NEXT: jne .LBB15_35 +; SSE4-NEXT: .LBB15_36: # %else34 +; SSE4-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE4-NEXT: jne .LBB15_37 +; SSE4-NEXT: .LBB15_38: # %else36 +; SSE4-NEXT: testl $524288, %eax # imm = 0x80000 +; SSE4-NEXT: jne .LBB15_39 +; SSE4-NEXT: .LBB15_40: # %else38 +; SSE4-NEXT: testl $1048576, %eax # imm = 0x100000 +; SSE4-NEXT: jne .LBB15_41 +; SSE4-NEXT: .LBB15_42: # %else40 +; SSE4-NEXT: testl $2097152, %eax # imm = 0x200000 +; SSE4-NEXT: jne .LBB15_43 +; SSE4-NEXT: .LBB15_44: # %else42 +; SSE4-NEXT: testl $4194304, %eax # imm = 0x400000 +; SSE4-NEXT: jne .LBB15_45 +; SSE4-NEXT: .LBB15_46: # %else44 +; SSE4-NEXT: testl $8388608, %eax # imm = 0x800000 +; SSE4-NEXT: jne .LBB15_47 +; SSE4-NEXT: .LBB15_48: # %else46 +; SSE4-NEXT: testl $16777216, %eax # imm = 0x1000000 +; SSE4-NEXT: jne .LBB15_49 +; SSE4-NEXT: .LBB15_50: # %else48 +; SSE4-NEXT: testl $33554432, %eax # imm = 0x2000000 +; SSE4-NEXT: jne .LBB15_51 +; SSE4-NEXT: .LBB15_52: # %else50 +; SSE4-NEXT: testl $67108864, %eax # imm = 0x4000000 +; SSE4-NEXT: jne .LBB15_53 +; SSE4-NEXT: .LBB15_54: # %else52 +; SSE4-NEXT: testl $134217728, %eax # imm = 0x8000000 +; SSE4-NEXT: jne .LBB15_55 +; SSE4-NEXT: .LBB15_56: # %else54 +; SSE4-NEXT: testl $268435456, %eax # imm = 0x10000000 +; SSE4-NEXT: jne .LBB15_57 +; SSE4-NEXT: .LBB15_58: # %else56 +; SSE4-NEXT: testl $536870912, %eax # imm = 0x20000000 +; SSE4-NEXT: jne .LBB15_59 +; SSE4-NEXT: .LBB15_60: # %else58 +; SSE4-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; SSE4-NEXT: jne .LBB15_61 +; SSE4-NEXT: .LBB15_62: # %else60 +; SSE4-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; SSE4-NEXT: jne .LBB15_63 +; SSE4-NEXT: .LBB15_64: # %else62 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB15_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB15_2: # %else -; SSE4-NEXT: pextrb $1, %xmm7, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB15_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB15_3: # %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB15_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $2, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB15_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB15_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB15_6: # %else4 -; SSE4-NEXT: pextrb $3, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB15_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB15_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB15_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB15_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB15_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB15_10: # %else8 -; SSE4-NEXT: pextrb $5, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB15_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB15_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB15_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; 
SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $6, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB15_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB15_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB15_14: # %else12 -; SSE4-NEXT: pextrb $7, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB15_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB15_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB15_16: # %else14 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB15_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB15_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB15_18: # %else16 -; SSE4-NEXT: pextrb $9, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB15_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB15_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB15_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $10, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB15_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB15_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB15_22: # %else20 -; SSE4-NEXT: pextrb $11, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB15_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB15_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB15_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB15_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB15_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB15_26: # %else24 -; SSE4-NEXT: pextrb $13, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB15_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB15_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB15_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE4-NEXT: pextrb $14, %xmm4, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 -; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB15_30: # %else28 -; SSE4-NEXT: pshufb %xmm6, %xmm3 -; SSE4-NEXT: pshufb %xmm6, %xmm2 -; SSE4-NEXT: pextrb $15, %xmm4, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 -; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB15_32: # %else30 -; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; 
SSE4-NEXT: je .LBB15_34 -; SSE4-NEXT: # %bb.33: # %cond.store31 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB15_29 +; SSE4-NEXT: jmp .LBB15_30 +; SSE4-NEXT: .LBB15_33: # %cond.store31 ; SSE4-NEXT: pextrb $0, %xmm2, 16(%rdi) -; SSE4-NEXT: .LBB15_34: # %else32 -; SSE4-NEXT: pextrb $1, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $131072, %eax # imm = 0x20000 ; SSE4-NEXT: je .LBB15_36 -; SSE4-NEXT: # %bb.35: # %cond.store33 +; SSE4-NEXT: .LBB15_35: # %cond.store33 ; SSE4-NEXT: pextrb $1, %xmm2, 17(%rdi) -; SSE4-NEXT: .LBB15_36: # %else34 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $2, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $262144, %eax # imm = 0x40000 ; SSE4-NEXT: je .LBB15_38 -; SSE4-NEXT: # %bb.37: # %cond.store35 +; SSE4-NEXT: .LBB15_37: # %cond.store35 ; SSE4-NEXT: pextrb $2, %xmm2, 18(%rdi) -; SSE4-NEXT: .LBB15_38: # %else36 -; SSE4-NEXT: pextrb $3, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $524288, %eax # imm = 0x80000 ; SSE4-NEXT: je .LBB15_40 -; SSE4-NEXT: # %bb.39: # %cond.store37 +; SSE4-NEXT: .LBB15_39: # %cond.store37 ; SSE4-NEXT: pextrb $3, %xmm2, 19(%rdi) -; SSE4-NEXT: .LBB15_40: # %else38 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE4-NEXT: je .LBB15_42 -; SSE4-NEXT: # %bb.41: # %cond.store39 +; SSE4-NEXT: .LBB15_41: # %cond.store39 ; SSE4-NEXT: pextrb $4, %xmm2, 20(%rdi) -; SSE4-NEXT: .LBB15_42: # %else40 -; SSE4-NEXT: pextrb $5, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2097152, %eax # imm = 0x200000 ; SSE4-NEXT: je .LBB15_44 -; SSE4-NEXT: # %bb.43: # %cond.store41 +; SSE4-NEXT: .LBB15_43: # %cond.store41 ; SSE4-NEXT: pextrb $5, %xmm2, 21(%rdi) -; SSE4-NEXT: .LBB15_44: # %else42 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $6, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE4-NEXT: je .LBB15_46 -; SSE4-NEXT: # %bb.45: # %cond.store43 +; SSE4-NEXT: .LBB15_45: # %cond.store43 ; SSE4-NEXT: pextrb $6, %xmm2, 22(%rdi) -; SSE4-NEXT: .LBB15_46: # %else44 -; SSE4-NEXT: pextrb $7, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8388608, %eax # imm = 0x800000 ; SSE4-NEXT: je .LBB15_48 -; SSE4-NEXT: # %bb.47: # %cond.store45 +; SSE4-NEXT: .LBB15_47: # %cond.store45 ; SSE4-NEXT: pextrb $7, %xmm2, 23(%rdi) -; SSE4-NEXT: .LBB15_48: # %else46 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE4-NEXT: je .LBB15_50 -; SSE4-NEXT: # %bb.49: # %cond.store47 +; SSE4-NEXT: .LBB15_49: # %cond.store47 ; SSE4-NEXT: pextrb $8, %xmm2, 24(%rdi) -; SSE4-NEXT: .LBB15_50: # %else48 -; SSE4-NEXT: pextrb $9, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $33554432, %eax # imm = 0x2000000 ; SSE4-NEXT: je .LBB15_52 -; SSE4-NEXT: # %bb.51: # %cond.store49 +; SSE4-NEXT: .LBB15_51: # %cond.store49 ; SSE4-NEXT: pextrb $9, %xmm2, 25(%rdi) -; SSE4-NEXT: .LBB15_52: # %else50 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $10, %xmm0, %eax 
-; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE4-NEXT: je .LBB15_54 -; SSE4-NEXT: # %bb.53: # %cond.store51 +; SSE4-NEXT: .LBB15_53: # %cond.store51 ; SSE4-NEXT: pextrb $10, %xmm2, 26(%rdi) -; SSE4-NEXT: .LBB15_54: # %else52 -; SSE4-NEXT: pextrb $11, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $134217728, %eax # imm = 0x8000000 ; SSE4-NEXT: je .LBB15_56 -; SSE4-NEXT: # %bb.55: # %cond.store53 +; SSE4-NEXT: .LBB15_55: # %cond.store53 ; SSE4-NEXT: pextrb $11, %xmm2, 27(%rdi) -; SSE4-NEXT: .LBB15_56: # %else54 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE4-NEXT: je .LBB15_58 -; SSE4-NEXT: # %bb.57: # %cond.store55 +; SSE4-NEXT: .LBB15_57: # %cond.store55 ; SSE4-NEXT: pextrb $12, %xmm2, 28(%rdi) -; SSE4-NEXT: .LBB15_58: # %else56 -; SSE4-NEXT: pextrb $13, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $536870912, %eax # imm = 0x20000000 ; SSE4-NEXT: je .LBB15_60 -; SSE4-NEXT: # %bb.59: # %cond.store57 +; SSE4-NEXT: .LBB15_59: # %cond.store57 ; SSE4-NEXT: pextrb $13, %xmm2, 29(%rdi) -; SSE4-NEXT: .LBB15_60: # %else58 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm5 -; SSE4-NEXT: pextrb $14, %xmm5, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE4-NEXT: je .LBB15_62 -; SSE4-NEXT: # %bb.61: # %cond.store59 +; SSE4-NEXT: .LBB15_61: # %cond.store59 ; SSE4-NEXT: pextrb $14, %xmm2, 30(%rdi) -; SSE4-NEXT: .LBB15_62: # %else60 -; SSE4-NEXT: pextrb $15, %xmm5, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; SSE4-NEXT: je .LBB15_64 -; SSE4-NEXT: # %bb.63: # %cond.store61 +; SSE4-NEXT: .LBB15_63: # %cond.store61 ; SSE4-NEXT: pextrb $15, %xmm2, 31(%rdi) -; SSE4-NEXT: .LBB15_64: # %else62 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpextrb $0, %xmm5, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpmovmskb %xmm3, %ecx +; AVX1-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: shll $16, %eax +; 
AVX1-NEXT: orl %ecx, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB15_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB15_3 +; AVX1-NEXT: .LBB15_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB15_5 +; AVX1-NEXT: .LBB15_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB15_7 +; AVX1-NEXT: .LBB15_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB15_9 +; AVX1-NEXT: .LBB15_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB15_11 +; AVX1-NEXT: .LBB15_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB15_13 +; AVX1-NEXT: .LBB15_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB15_15 +; AVX1-NEXT: .LBB15_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB15_17 +; AVX1-NEXT: .LBB15_18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB15_19 +; AVX1-NEXT: .LBB15_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB15_21 +; AVX1-NEXT: .LBB15_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB15_23 +; AVX1-NEXT: .LBB15_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB15_25 +; AVX1-NEXT: .LBB15_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB15_27 +; AVX1-NEXT: .LBB15_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB15_29 +; AVX1-NEXT: .LBB15_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: je .LBB15_32 +; AVX1-NEXT: .LBB15_31: # %cond.store29 +; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX1-NEXT: .LBB15_32: # %else30 +; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: jne .LBB15_33 +; AVX1-NEXT: # %bb.34: # %else32 +; AVX1-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX1-NEXT: jne .LBB15_35 +; AVX1-NEXT: .LBB15_36: # %else34 +; AVX1-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX1-NEXT: jne .LBB15_37 +; AVX1-NEXT: .LBB15_38: # %else36 +; AVX1-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX1-NEXT: jne .LBB15_39 +; AVX1-NEXT: .LBB15_40: # %else38 +; AVX1-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX1-NEXT: jne .LBB15_41 +; AVX1-NEXT: .LBB15_42: # %else40 +; AVX1-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX1-NEXT: jne .LBB15_43 +; AVX1-NEXT: .LBB15_44: # %else42 +; AVX1-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX1-NEXT: jne .LBB15_45 +; AVX1-NEXT: .LBB15_46: # %else44 +; AVX1-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX1-NEXT: jne .LBB15_47 +; AVX1-NEXT: .LBB15_48: # %else46 +; AVX1-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX1-NEXT: jne .LBB15_49 +; AVX1-NEXT: .LBB15_50: # %else48 +; AVX1-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX1-NEXT: jne .LBB15_51 +; AVX1-NEXT: .LBB15_52: # %else50 +; AVX1-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX1-NEXT: jne .LBB15_53 +; AVX1-NEXT: .LBB15_54: # %else52 +; AVX1-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX1-NEXT: jne .LBB15_55 +; AVX1-NEXT: .LBB15_56: # %else54 +; AVX1-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX1-NEXT: jne .LBB15_57 +; AVX1-NEXT: .LBB15_58: # %else56 +; AVX1-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX1-NEXT: jne .LBB15_59 +; AVX1-NEXT: .LBB15_60: # %else58 +; AVX1-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX1-NEXT: jne .LBB15_61 +; AVX1-NEXT: .LBB15_62: # %else60 +; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; 
AVX1-NEXT: jne .LBB15_63 +; AVX1-NEXT: .LBB15_64: # %else62 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB15_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB15_2: # %else -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB15_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB15_3: # %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB15_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $2, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB15_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB15_5: # %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB15_6: # %else4 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB15_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB15_7: # %cond.store5 ; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB15_8: # %else6 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB15_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB15_10: # %else8 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $5, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: .LBB15_9: # %cond.store7 +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB15_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB15_11: # %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB15_12: # %else10 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $6, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB15_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB15_13: # %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB15_14: # %else12 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB15_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB15_15: # %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB15_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB15_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: .LBB15_17: # %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB15_18: # %else16 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: 
vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB15_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB15_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB15_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB15_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB15_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB15_22: # %else20 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB15_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB15_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB15_24: # %else22 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $12, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB15_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB15_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB15_26: # %else24 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $13, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB15_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB15_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB15_28: # %else26 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $14, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB15_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB15_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB15_30: # %else28 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 -; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB15_32: # %else30 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: je .LBB15_34 -; AVX1-NEXT: # %bb.33: # %cond.store31 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB15_31 +; AVX1-NEXT: jmp .LBB15_32 +; AVX1-NEXT: .LBB15_33: # %cond.store31 ; AVX1-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX1-NEXT: .LBB15_34: # %else32 -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $131072, %eax # imm = 0x20000 ; AVX1-NEXT: je .LBB15_36 -; AVX1-NEXT: # %bb.35: # %cond.store33 +; AVX1-NEXT: .LBB15_35: # 
%cond.store33 ; AVX1-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX1-NEXT: .LBB15_36: # %else34 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $262144, %eax # imm = 0x40000 ; AVX1-NEXT: je .LBB15_38 -; AVX1-NEXT: # %bb.37: # %cond.store35 +; AVX1-NEXT: .LBB15_37: # %cond.store35 ; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX1-NEXT: .LBB15_38: # %else36 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $524288, %eax # imm = 0x80000 ; AVX1-NEXT: je .LBB15_40 -; AVX1-NEXT: # %bb.39: # %cond.store37 +; AVX1-NEXT: .LBB15_39: # %cond.store37 ; AVX1-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX1-NEXT: .LBB15_40: # %else38 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1048576, %eax # imm = 0x100000 ; AVX1-NEXT: je .LBB15_42 -; AVX1-NEXT: # %bb.41: # %cond.store39 +; AVX1-NEXT: .LBB15_41: # %cond.store39 ; AVX1-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX1-NEXT: .LBB15_42: # %else40 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2097152, %eax # imm = 0x200000 ; AVX1-NEXT: je .LBB15_44 -; AVX1-NEXT: # %bb.43: # %cond.store41 +; AVX1-NEXT: .LBB15_43: # %cond.store41 ; AVX1-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX1-NEXT: .LBB15_44: # %else42 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4194304, %eax # imm = 0x400000 ; AVX1-NEXT: je .LBB15_46 -; AVX1-NEXT: # %bb.45: # %cond.store43 +; AVX1-NEXT: .LBB15_45: # %cond.store43 ; AVX1-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX1-NEXT: .LBB15_46: # %else44 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8388608, %eax # imm = 0x800000 ; AVX1-NEXT: je .LBB15_48 -; AVX1-NEXT: # %bb.47: # %cond.store45 +; AVX1-NEXT: .LBB15_47: # %cond.store45 ; AVX1-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX1-NEXT: .LBB15_48: # %else46 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16777216, %eax # imm = 0x1000000 ; AVX1-NEXT: je .LBB15_50 -; AVX1-NEXT: # %bb.49: # %cond.store47 +; AVX1-NEXT: .LBB15_49: # %cond.store47 ; AVX1-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX1-NEXT: .LBB15_50: # %else48 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $33554432, %eax # imm = 0x2000000 ; AVX1-NEXT: je .LBB15_52 -; AVX1-NEXT: # %bb.51: # %cond.store49 +; AVX1-NEXT: .LBB15_51: # %cond.store49 ; AVX1-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX1-NEXT: .LBB15_52: # %else50 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $67108864, %eax # imm = 0x4000000 ; AVX1-NEXT: je .LBB15_54 -; AVX1-NEXT: # %bb.53: # %cond.store51 +; AVX1-NEXT: .LBB15_53: # 
%cond.store51 ; AVX1-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX1-NEXT: .LBB15_54: # %else52 -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $134217728, %eax # imm = 0x8000000 ; AVX1-NEXT: je .LBB15_56 -; AVX1-NEXT: # %bb.55: # %cond.store53 +; AVX1-NEXT: .LBB15_55: # %cond.store53 ; AVX1-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX1-NEXT: .LBB15_56: # %else54 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $268435456, %eax # imm = 0x10000000 ; AVX1-NEXT: je .LBB15_58 -; AVX1-NEXT: # %bb.57: # %cond.store55 +; AVX1-NEXT: .LBB15_57: # %cond.store55 ; AVX1-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX1-NEXT: .LBB15_58: # %else56 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $536870912, %eax # imm = 0x20000000 ; AVX1-NEXT: je .LBB15_60 -; AVX1-NEXT: # %bb.59: # %cond.store57 +; AVX1-NEXT: .LBB15_59: # %cond.store57 ; AVX1-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX1-NEXT: .LBB15_60: # %else58 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; AVX1-NEXT: je .LBB15_62 -; AVX1-NEXT: # %bb.61: # %cond.store59 +; AVX1-NEXT: .LBB15_61: # %cond.store59 ; AVX1-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX1-NEXT: .LBB15_62: # %else60 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX1-NEXT: je .LBB15_64 -; AVX1-NEXT: # %bb.63: # %cond.store61 +; AVX1-NEXT: .LBB15_63: # %cond.store61 ; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX1-NEXT: .LBB15_64: # %else62 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v32i16_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $0, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB15_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpmovmskb %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB15_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB15_3 +; AVX2-NEXT: .LBB15_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB15_5 +; AVX2-NEXT: .LBB15_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB15_7 +; AVX2-NEXT: .LBB15_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB15_9 +; AVX2-NEXT: .LBB15_10: # %else8 +; 
AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB15_11 +; AVX2-NEXT: .LBB15_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB15_13 +; AVX2-NEXT: .LBB15_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB15_15 +; AVX2-NEXT: .LBB15_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB15_17 +; AVX2-NEXT: .LBB15_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB15_19 +; AVX2-NEXT: .LBB15_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB15_21 +; AVX2-NEXT: .LBB15_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB15_23 +; AVX2-NEXT: .LBB15_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB15_25 +; AVX2-NEXT: .LBB15_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB15_27 +; AVX2-NEXT: .LBB15_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB15_29 +; AVX2-NEXT: .LBB15_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: je .LBB15_32 +; AVX2-NEXT: .LBB15_31: # %cond.store29 +; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX2-NEXT: .LBB15_32: # %else30 +; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: jne .LBB15_33 +; AVX2-NEXT: # %bb.34: # %else32 +; AVX2-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX2-NEXT: jne .LBB15_35 +; AVX2-NEXT: .LBB15_36: # %else34 +; AVX2-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX2-NEXT: jne .LBB15_37 +; AVX2-NEXT: .LBB15_38: # %else36 +; AVX2-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX2-NEXT: jne .LBB15_39 +; AVX2-NEXT: .LBB15_40: # %else38 +; AVX2-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX2-NEXT: jne .LBB15_41 +; AVX2-NEXT: .LBB15_42: # %else40 +; AVX2-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX2-NEXT: jne .LBB15_43 +; AVX2-NEXT: .LBB15_44: # %else42 +; AVX2-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX2-NEXT: jne .LBB15_45 +; AVX2-NEXT: .LBB15_46: # %else44 +; AVX2-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX2-NEXT: jne .LBB15_47 +; AVX2-NEXT: .LBB15_48: # %else46 +; AVX2-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX2-NEXT: jne .LBB15_49 +; AVX2-NEXT: .LBB15_50: # %else48 +; AVX2-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX2-NEXT: jne .LBB15_51 +; AVX2-NEXT: .LBB15_52: # %else50 +; AVX2-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX2-NEXT: jne .LBB15_53 +; AVX2-NEXT: .LBB15_54: # %else52 +; AVX2-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX2-NEXT: jne .LBB15_55 +; AVX2-NEXT: .LBB15_56: # %else54 +; AVX2-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX2-NEXT: jne .LBB15_57 +; AVX2-NEXT: .LBB15_58: # %else56 +; AVX2-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX2-NEXT: jne .LBB15_59 +; AVX2-NEXT: .LBB15_60: # %else58 +; AVX2-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX2-NEXT: jne .LBB15_61 +; AVX2-NEXT: .LBB15_62: # %else60 +; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX2-NEXT: jne .LBB15_63 +; AVX2-NEXT: .LBB15_64: # %else62 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB15_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB15_2: # %else -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB15_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB15_3: # %cond.store1 ; 
AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB15_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $2, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB15_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB15_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB15_6: # %else4 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB15_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB15_7: # %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB15_8: # %else6 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $4, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB15_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB15_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB15_10: # %else8 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB15_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB15_11: # %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB15_12: # %else10 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $6, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB15_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB15_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB15_14: # %else12 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB15_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB15_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB15_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $8, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB15_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB15_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB15_18: # %else16 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB15_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB15_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX2-NEXT: .LBB15_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $10, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB15_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB15_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB15_22: # %else20 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax 
# imm = 0x800 ; AVX2-NEXT: je .LBB15_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB15_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB15_24: # %else22 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB15_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB15_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB15_26: # %else24 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB15_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB15_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB15_28: # %else26 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $14, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB15_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB15_29: # %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB15_30: # %else28 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB15_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 -; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB15_32: # %else30 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: je .LBB15_34 -; AVX2-NEXT: # %bb.33: # %cond.store31 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB15_31 +; AVX2-NEXT: jmp .LBB15_32 +; AVX2-NEXT: .LBB15_33: # %cond.store31 ; AVX2-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX2-NEXT: .LBB15_34: # %else32 -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $131072, %eax # imm = 0x20000 ; AVX2-NEXT: je .LBB15_36 -; AVX2-NEXT: # %bb.35: # %cond.store33 +; AVX2-NEXT: .LBB15_35: # %cond.store33 ; AVX2-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX2-NEXT: .LBB15_36: # %else34 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $262144, %eax # imm = 0x40000 ; AVX2-NEXT: je .LBB15_38 -; AVX2-NEXT: # %bb.37: # %cond.store35 +; AVX2-NEXT: .LBB15_37: # %cond.store35 ; AVX2-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX2-NEXT: .LBB15_38: # %else36 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $524288, %eax # imm = 0x80000 ; AVX2-NEXT: je .LBB15_40 -; AVX2-NEXT: # %bb.39: # %cond.store37 +; AVX2-NEXT: .LBB15_39: # %cond.store37 ; AVX2-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX2-NEXT: .LBB15_40: # %else38 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1048576, %eax # imm = 0x100000 ; AVX2-NEXT: je .LBB15_42 -; AVX2-NEXT: # %bb.41: # %cond.store39 +; AVX2-NEXT: 
.LBB15_41: # %cond.store39 ; AVX2-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX2-NEXT: .LBB15_42: # %else40 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2097152, %eax # imm = 0x200000 ; AVX2-NEXT: je .LBB15_44 -; AVX2-NEXT: # %bb.43: # %cond.store41 +; AVX2-NEXT: .LBB15_43: # %cond.store41 ; AVX2-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX2-NEXT: .LBB15_44: # %else42 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4194304, %eax # imm = 0x400000 ; AVX2-NEXT: je .LBB15_46 -; AVX2-NEXT: # %bb.45: # %cond.store43 +; AVX2-NEXT: .LBB15_45: # %cond.store43 ; AVX2-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX2-NEXT: .LBB15_46: # %else44 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8388608, %eax # imm = 0x800000 ; AVX2-NEXT: je .LBB15_48 -; AVX2-NEXT: # %bb.47: # %cond.store45 +; AVX2-NEXT: .LBB15_47: # %cond.store45 ; AVX2-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX2-NEXT: .LBB15_48: # %else46 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; AVX2-NEXT: je .LBB15_50 -; AVX2-NEXT: # %bb.49: # %cond.store47 +; AVX2-NEXT: .LBB15_49: # %cond.store47 ; AVX2-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX2-NEXT: .LBB15_50: # %else48 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $33554432, %eax # imm = 0x2000000 ; AVX2-NEXT: je .LBB15_52 -; AVX2-NEXT: # %bb.51: # %cond.store49 +; AVX2-NEXT: .LBB15_51: # %cond.store49 ; AVX2-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX2-NEXT: .LBB15_52: # %else50 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; AVX2-NEXT: je .LBB15_54 -; AVX2-NEXT: # %bb.53: # %cond.store51 +; AVX2-NEXT: .LBB15_53: # %cond.store51 ; AVX2-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX2-NEXT: .LBB15_54: # %else52 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $134217728, %eax # imm = 0x8000000 ; AVX2-NEXT: je .LBB15_56 -; AVX2-NEXT: # %bb.55: # %cond.store53 +; AVX2-NEXT: .LBB15_55: # %cond.store53 ; AVX2-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX2-NEXT: .LBB15_56: # %else54 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; AVX2-NEXT: je .LBB15_58 -; AVX2-NEXT: # %bb.57: # %cond.store55 +; AVX2-NEXT: .LBB15_57: # %cond.store55 ; AVX2-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX2-NEXT: .LBB15_58: # %else56 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $536870912, %eax # imm = 0x20000000 ; AVX2-NEXT: je .LBB15_60 -; AVX2-NEXT: # %bb.59: # %cond.store57 +; AVX2-NEXT: .LBB15_59: # %cond.store57 ; AVX2-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX2-NEXT: .LBB15_60: # %else58 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; 
AVX2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; AVX2-NEXT: je .LBB15_62 -; AVX2-NEXT: # %bb.61: # %cond.store59 +; AVX2-NEXT: .LBB15_61: # %cond.store59 ; AVX2-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX2-NEXT: .LBB15_62: # %else60 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX2-NEXT: je .LBB15_64 -; AVX2-NEXT: # %bb.63: # %cond.store61 +; AVX2-NEXT: .LBB15_63: # %cond.store61 ; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX2-NEXT: .LBB15_64: # %else62 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpternlogq $15, %zmm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4 -; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpmovmskb %ymm2, %eax +; AVX512F-NEXT: notl %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB15_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB15_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB15_3 +; AVX512F-NEXT: .LBB15_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB15_5 +; AVX512F-NEXT: .LBB15_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB15_7 +; AVX512F-NEXT: .LBB15_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB15_9 +; AVX512F-NEXT: .LBB15_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB15_11 +; AVX512F-NEXT: .LBB15_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB15_13 +; AVX512F-NEXT: .LBB15_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB15_15 +; AVX512F-NEXT: .LBB15_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB15_17 +; AVX512F-NEXT: .LBB15_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB15_19 +; AVX512F-NEXT: .LBB15_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB15_21 +; AVX512F-NEXT: .LBB15_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB15_23 +; AVX512F-NEXT: .LBB15_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB15_25 +; AVX512F-NEXT: .LBB15_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB15_27 +; AVX512F-NEXT: .LBB15_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB15_29 +; AVX512F-NEXT: .LBB15_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: je .LBB15_32 +; AVX512F-NEXT: .LBB15_31: # %cond.store29 +; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; 
AVX512F-NEXT: .LBB15_32: # %else30 +; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: jne .LBB15_33 +; AVX512F-NEXT: # %bb.34: # %else32 +; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512F-NEXT: jne .LBB15_35 +; AVX512F-NEXT: .LBB15_36: # %else34 +; AVX512F-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512F-NEXT: jne .LBB15_37 +; AVX512F-NEXT: .LBB15_38: # %else36 +; AVX512F-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512F-NEXT: jne .LBB15_39 +; AVX512F-NEXT: .LBB15_40: # %else38 +; AVX512F-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512F-NEXT: jne .LBB15_41 +; AVX512F-NEXT: .LBB15_42: # %else40 +; AVX512F-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512F-NEXT: jne .LBB15_43 +; AVX512F-NEXT: .LBB15_44: # %else42 +; AVX512F-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512F-NEXT: jne .LBB15_45 +; AVX512F-NEXT: .LBB15_46: # %else44 +; AVX512F-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512F-NEXT: jne .LBB15_47 +; AVX512F-NEXT: .LBB15_48: # %else46 +; AVX512F-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX512F-NEXT: jne .LBB15_49 +; AVX512F-NEXT: .LBB15_50: # %else48 +; AVX512F-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512F-NEXT: jne .LBB15_51 +; AVX512F-NEXT: .LBB15_52: # %else50 +; AVX512F-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512F-NEXT: jne .LBB15_53 +; AVX512F-NEXT: .LBB15_54: # %else52 +; AVX512F-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512F-NEXT: jne .LBB15_55 +; AVX512F-NEXT: .LBB15_56: # %else54 +; AVX512F-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512F-NEXT: jne .LBB15_57 +; AVX512F-NEXT: .LBB15_58: # %else56 +; AVX512F-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512F-NEXT: jne .LBB15_59 +; AVX512F-NEXT: .LBB15_60: # %else58 +; AVX512F-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512F-NEXT: jne .LBB15_61 +; AVX512F-NEXT: .LBB15_62: # %else60 +; AVX512F-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512F-NEXT: jne .LBB15_63 +; AVX512F-NEXT: .LBB15_64: # %else62 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB15_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB15_2: # %else -; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB15_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB15_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB15_4: # %else2 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB15_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB15_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB15_6: # %else4 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw 
%k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB15_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB15_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB15_8: # %else6 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB15_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB15_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB15_10: # %else8 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB15_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB15_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB15_12: # %else10 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB15_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB15_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB15_14: # %else12 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB15_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB15_16: # %else14 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: .LBB15_15: # %cond.store13 +; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB15_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB15_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB15_18: # %else16 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB15_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB15_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB15_20: # %else18 
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB15_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB15_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB15_22: # %else20 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB15_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB15_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB15_24: # %else22 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB15_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB15_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB15_26: # %else24 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB15_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB15_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB15_28: # %else26 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB15_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB15_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB15_30: # %else28 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB15_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 -; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB15_32: # %else30 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, 
%zmm3, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: je .LBB15_34 -; AVX512F-NEXT: # %bb.33: # %cond.store31 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB15_31 +; AVX512F-NEXT: jmp .LBB15_32 +; AVX512F-NEXT: .LBB15_33: # %cond.store31 ; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX512F-NEXT: .LBB15_34: # %else32 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000 ; AVX512F-NEXT: je .LBB15_36 -; AVX512F-NEXT: # %bb.35: # %cond.store33 +; AVX512F-NEXT: .LBB15_35: # %cond.store33 ; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX512F-NEXT: .LBB15_36: # %else34 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $262144, %eax # imm = 0x40000 ; AVX512F-NEXT: je .LBB15_38 -; AVX512F-NEXT: # %bb.37: # %cond.store35 +; AVX512F-NEXT: .LBB15_37: # %cond.store35 ; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX512F-NEXT: .LBB15_38: # %else36 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $524288, %eax # imm = 0x80000 ; AVX512F-NEXT: je .LBB15_40 -; AVX512F-NEXT: # %bb.39: # %cond.store37 +; AVX512F-NEXT: .LBB15_39: # %cond.store37 ; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX512F-NEXT: .LBB15_40: # %else38 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1048576, %eax # imm = 0x100000 ; AVX512F-NEXT: je .LBB15_42 -; AVX512F-NEXT: # %bb.41: # %cond.store39 +; AVX512F-NEXT: .LBB15_41: # %cond.store39 ; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX512F-NEXT: .LBB15_42: # %else40 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2097152, %eax # imm = 0x200000 ; AVX512F-NEXT: je .LBB15_44 -; AVX512F-NEXT: # %bb.43: # %cond.store41 +; AVX512F-NEXT: .LBB15_43: # %cond.store41 ; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX512F-NEXT: .LBB15_44: # %else42 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4194304, 
%eax # imm = 0x400000 ; AVX512F-NEXT: je .LBB15_46 -; AVX512F-NEXT: # %bb.45: # %cond.store43 +; AVX512F-NEXT: .LBB15_45: # %cond.store43 ; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX512F-NEXT: .LBB15_46: # %else44 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8388608, %eax # imm = 0x800000 ; AVX512F-NEXT: je .LBB15_48 -; AVX512F-NEXT: # %bb.47: # %cond.store45 +; AVX512F-NEXT: .LBB15_47: # %cond.store45 ; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX512F-NEXT: .LBB15_48: # %else46 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16777216, %eax # imm = 0x1000000 ; AVX512F-NEXT: je .LBB15_50 -; AVX512F-NEXT: # %bb.49: # %cond.store47 +; AVX512F-NEXT: .LBB15_49: # %cond.store47 ; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX512F-NEXT: .LBB15_50: # %else48 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $33554432, %eax # imm = 0x2000000 ; AVX512F-NEXT: je .LBB15_52 -; AVX512F-NEXT: # %bb.51: # %cond.store49 +; AVX512F-NEXT: .LBB15_51: # %cond.store49 ; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX512F-NEXT: .LBB15_52: # %else50 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $67108864, %eax # imm = 0x4000000 ; AVX512F-NEXT: je .LBB15_54 -; AVX512F-NEXT: # %bb.53: # %cond.store51 +; AVX512F-NEXT: .LBB15_53: # %cond.store51 ; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX512F-NEXT: .LBB15_54: # %else52 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $134217728, %eax # imm = 0x8000000 ; AVX512F-NEXT: je .LBB15_56 -; AVX512F-NEXT: # %bb.55: # %cond.store53 +; AVX512F-NEXT: .LBB15_55: # %cond.store53 ; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX512F-NEXT: .LBB15_56: # %else54 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $268435456, %eax # imm = 0x10000000 ; AVX512F-NEXT: je .LBB15_58 -; AVX512F-NEXT: # %bb.57: # %cond.store55 +; AVX512F-NEXT: .LBB15_57: # %cond.store55 ; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; 
AVX512F-NEXT: .LBB15_58: # %else56 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $536870912, %eax # imm = 0x20000000 ; AVX512F-NEXT: je .LBB15_60 -; AVX512F-NEXT: # %bb.59: # %cond.store57 +; AVX512F-NEXT: .LBB15_59: # %cond.store57 ; AVX512F-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX512F-NEXT: .LBB15_60: # %else58 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; AVX512F-NEXT: je .LBB15_62 -; AVX512F-NEXT: # %bb.61: # %cond.store59 +; AVX512F-NEXT: .LBB15_61: # %cond.store59 ; AVX512F-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX512F-NEXT: .LBB15_62: # %else60 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX512F-NEXT: je .LBB15_64 -; AVX512F-NEXT: # %bb.63: # %cond.store61 +; AVX512F-NEXT: .LBB15_63: # %cond.store61 ; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX512F-NEXT: .LBB15_64: # %else62 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6208,753 +5517,606 @@ ; SSE2-LABEL: truncstore_v16i16_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al +; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE2-NEXT: pmovmskb %xmm3, %eax +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB16_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB16_2: # %else -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB16_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB16_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB16_3 ; SSE2-NEXT: .LBB16_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB16_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB16_5 ; SSE2-NEXT: .LBB16_6: # %else4 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB16_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB16_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 
3(%rdi) ; SSE2-NEXT: .LBB16_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB16_10: # %else8 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB16_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB16_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB16_14: # %else12 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB16_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB16_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB16_18: # %else16 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB16_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB16_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB16_22: # %else20 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB16_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB16_24: # %else22 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB16_26: # %else24 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB16_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB16_28: # %else26 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm2 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx -; SSE2-NEXT: je .LBB16_30 -; SSE2-NEXT: # %bb.29: 
# %cond.store27 +; SSE2-NEXT: jne .LBB16_29 +; SSE2-NEXT: # %bb.30: # %else28 +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB16_31 +; SSE2-NEXT: .LBB16_32: # %else30 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB16_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB16_4 +; SSE2-NEXT: .LBB16_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB16_6 +; SSE2-NEXT: .LBB16_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB16_7 +; SSE2-NEXT: jmp .LBB16_8 +; SSE2-NEXT: .LBB16_29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) -; SSE2-NEXT: .LBB16_30: # %else28 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB16_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 +; SSE2-NEXT: .LBB16_31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) -; SSE2-NEXT: .LBB16_32: # %else30 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i16_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; SSE4-NEXT: pshufb %xmm4, %xmm1 ; SSE4-NEXT: pshufb %xmm4, %xmm0 ; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE4-NEXT: pextrb $0, %xmm3, %eax -; SSE4-NEXT: notb %al +; SSE4-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE4-NEXT: pmovmskb %xmm3, %eax +; SSE4-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB16_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB16_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB16_3 +; SSE4-NEXT: .LBB16_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB16_5 +; SSE4-NEXT: .LBB16_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB16_7 +; SSE4-NEXT: .LBB16_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB16_9 +; SSE4-NEXT: .LBB16_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB16_11 +; SSE4-NEXT: .LBB16_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB16_13 +; SSE4-NEXT: .LBB16_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB16_15 +; SSE4-NEXT: .LBB16_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB16_17 +; SSE4-NEXT: .LBB16_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB16_19 +; SSE4-NEXT: .LBB16_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB16_21 +; SSE4-NEXT: .LBB16_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB16_23 +; SSE4-NEXT: .LBB16_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB16_25 +; SSE4-NEXT: .LBB16_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB16_27 +; SSE4-NEXT: .LBB16_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB16_29 +; SSE4-NEXT: .LBB16_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB16_31 +; SSE4-NEXT: .LBB16_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB16_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB16_2: # %else -; SSE4-NEXT: pextrb $1, %xmm3, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB16_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 
+; SSE4-NEXT: .LBB16_3: # %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB16_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $2, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB16_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB16_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB16_6: # %else4 -; SSE4-NEXT: pextrb $3, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB16_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB16_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB16_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB16_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB16_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB16_10: # %else8 -; SSE4-NEXT: pextrb $5, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB16_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB16_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB16_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $6, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB16_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB16_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB16_14: # %else12 -; SSE4-NEXT: pextrb $7, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB16_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB16_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB16_16: # %else14 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB16_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB16_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB16_18: # %else16 -; SSE4-NEXT: pextrb $9, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB16_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB16_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB16_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $10, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB16_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB16_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB16_22: # %else20 -; SSE4-NEXT: pextrb $11, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB16_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB16_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB16_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; 
SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB16_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB16_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB16_26: # %else24 -; SSE4-NEXT: pextrb $13, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB16_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB16_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB16_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm2 -; SSE4-NEXT: pextrb $14, %xmm2, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB16_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB16_29: # %cond.store27 ; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB16_30: # %else28 -; SSE4-NEXT: pextrb $15, %xmm2, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB16_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB16_31: # %cond.store29 ; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB16_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: notb %al +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB16_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB16_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB16_3 +; AVX1-NEXT: .LBB16_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB16_5 +; AVX1-NEXT: .LBB16_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB16_7 +; AVX1-NEXT: .LBB16_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB16_9 +; AVX1-NEXT: .LBB16_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB16_11 +; AVX1-NEXT: .LBB16_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB16_13 +; AVX1-NEXT: .LBB16_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB16_15 +; AVX1-NEXT: .LBB16_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB16_17 +; AVX1-NEXT: .LBB16_18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB16_19 +; AVX1-NEXT: .LBB16_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB16_21 +; AVX1-NEXT: .LBB16_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB16_23 +; AVX1-NEXT: .LBB16_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB16_25 +; AVX1-NEXT: .LBB16_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB16_27 +; AVX1-NEXT: .LBB16_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB16_29 +; AVX1-NEXT: .LBB16_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB16_31 +; AVX1-NEXT: .LBB16_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB16_1: # 
%cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB16_2: # %else -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB16_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB16_3: # %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB16_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB16_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB16_5: # %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB16_6: # %else4 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB16_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB16_7: # %cond.store5 ; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB16_8: # %else6 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB16_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB16_9: # %cond.store7 ; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB16_10: # %else8 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB16_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB16_11: # %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB16_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB16_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB16_13: # %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB16_14: # %else12 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB16_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB16_15: # %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB16_16: # %else14 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB16_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: .LBB16_17: # %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB16_18: # %else16 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB16_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB16_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB16_20: # %else18 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB16_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB16_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB16_22: # %else20 -; AVX1-NEXT: vpextrb $11, %xmm2, 
%eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB16_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB16_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB16_24: # %else22 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB16_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB16_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB16_26: # %else24 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB16_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB16_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB16_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB16_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB16_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB16_30: # %else28 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB16_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB16_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB16_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: notb %al +; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB16_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB16_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB16_3 +; AVX2-NEXT: .LBB16_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB16_5 +; AVX2-NEXT: .LBB16_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB16_7 +; AVX2-NEXT: .LBB16_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB16_9 +; AVX2-NEXT: .LBB16_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB16_11 +; AVX2-NEXT: .LBB16_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB16_13 +; AVX2-NEXT: .LBB16_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB16_15 +; AVX2-NEXT: .LBB16_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB16_17 +; AVX2-NEXT: .LBB16_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB16_19 +; AVX2-NEXT: .LBB16_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB16_21 +; AVX2-NEXT: .LBB16_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB16_23 +; AVX2-NEXT: .LBB16_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB16_25 +; AVX2-NEXT: .LBB16_26: # 
%else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB16_27 +; AVX2-NEXT: .LBB16_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB16_29 +; AVX2-NEXT: .LBB16_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB16_31 +; AVX2-NEXT: .LBB16_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB16_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB16_2: # %else -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB16_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB16_3: # %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB16_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB16_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB16_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB16_6: # %else4 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB16_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 -; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB16_8: # %else6 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: je .LBB16_8 +; AVX2-NEXT: .LBB16_7: # %cond.store5 +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB16_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB16_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB16_10: # %else8 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB16_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB16_11: # %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB16_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB16_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB16_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB16_14: # %else12 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB16_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB16_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB16_16: # %else14 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB16_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB16_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB16_18: # %else16 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB16_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB16_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 
9(%rdi) -; AVX2-NEXT: .LBB16_20: # %else18 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB16_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB16_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB16_22: # %else20 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB16_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB16_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB16_24: # %else22 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB16_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB16_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB16_26: # %else24 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB16_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB16_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB16_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB16_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB16_29: # %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB16_30: # %else28 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB16_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB16_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB16_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v16i16_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpmovmskb %xmm1, %eax +; AVX512F-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB16_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB16_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB16_3 +; AVX512F-NEXT: .LBB16_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB16_5 +; AVX512F-NEXT: .LBB16_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB16_7 +; AVX512F-NEXT: .LBB16_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: 
jne .LBB16_9 +; AVX512F-NEXT: .LBB16_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB16_11 +; AVX512F-NEXT: .LBB16_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB16_13 +; AVX512F-NEXT: .LBB16_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB16_15 +; AVX512F-NEXT: .LBB16_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB16_17 +; AVX512F-NEXT: .LBB16_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB16_19 +; AVX512F-NEXT: .LBB16_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB16_21 +; AVX512F-NEXT: .LBB16_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB16_23 +; AVX512F-NEXT: .LBB16_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB16_25 +; AVX512F-NEXT: .LBB16_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB16_27 +; AVX512F-NEXT: .LBB16_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB16_29 +; AVX512F-NEXT: .LBB16_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB16_31 +; AVX512F-NEXT: .LBB16_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB16_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB16_2: # %else -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB16_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB16_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB16_4: # %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB16_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB16_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB16_6: # %else4 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB16_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB16_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB16_8: # %else6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB16_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB16_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; 
AVX512F-NEXT: .LBB16_10: # %else8 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB16_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB16_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB16_12: # %else10 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB16_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB16_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB16_14: # %else12 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB16_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB16_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB16_16: # %else14 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB16_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB16_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB16_18: # %else16 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB16_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB16_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB16_20: # %else18 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB16_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB16_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB16_22: # %else20 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl 
$2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB16_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB16_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB16_24: # %else22 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB16_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB16_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB16_26: # %else24 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB16_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB16_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB16_28: # %else26 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB16_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB16_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB16_30: # %else28 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB16_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB16_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB16_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6986,307 +6148,269 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB17_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB17_2: # %else -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB17_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB17_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: packsswb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: 
movb %al, 2(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB17_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB17_3 +; SSE2-NEXT: .LBB17_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB17_5 ; SSE2-NEXT: .LBB17_6: # %else4 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB17_7 ; SSE2-NEXT: .LBB17_8: # %else6 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB17_9 ; SSE2-NEXT: .LBB17_10: # %else8 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB17_11 ; SSE2-NEXT: .LBB17_12: # %else10 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB17_13 ; SSE2-NEXT: .LBB17_14: # %else12 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB17_15 +; SSE2-NEXT: .LBB17_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB17_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB17_4 +; SSE2-NEXT: .LBB17_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB17_6 +; SSE2-NEXT: .LBB17_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB17_8 +; SSE2-NEXT: .LBB17_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB17_10 +; SSE2-NEXT: .LBB17_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB17_12 +; SSE2-NEXT: .LBB17_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB17_14 +; SSE2-NEXT: .LBB17_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB17_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB17_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB17_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i16_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: pcmpeqw %xmm1, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm1 +; 
SSE4-NEXT: packsswb %xmm0, %xmm1 +; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB17_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB17_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB17_3 +; SSE4-NEXT: .LBB17_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB17_5 +; SSE4-NEXT: .LBB17_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB17_7 +; SSE4-NEXT: .LBB17_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB17_9 +; SSE4-NEXT: .LBB17_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB17_11 +; SSE4-NEXT: .LBB17_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB17_13 +; SSE4-NEXT: .LBB17_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB17_15 +; SSE4-NEXT: .LBB17_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB17_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB17_2: # %else -; SSE4-NEXT: pextrb $2, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB17_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB17_3: # %cond.store1 ; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB17_4: # %else2 -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB17_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB17_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB17_6: # %else4 -; SSE4-NEXT: pextrb $6, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB17_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB17_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB17_8: # %else6 -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB17_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB17_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB17_10: # %else8 -; SSE4-NEXT: pextrb $10, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB17_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB17_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB17_12: # %else10 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB17_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB17_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB17_14: # %else12 -; SSE4-NEXT: pextrb $14, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB17_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB17_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB17_16: # %else14 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v8i16_v8i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $0, %xmm2, %eax +; 
AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpmovmskb %xmm1, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB17_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB17_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB17_3 +; AVX-NEXT: .LBB17_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB17_5 +; AVX-NEXT: .LBB17_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB17_7 +; AVX-NEXT: .LBB17_8: # %else6 +; AVX-NEXT: testb $16, %al +; AVX-NEXT: jne .LBB17_9 +; AVX-NEXT: .LBB17_10: # %else8 +; AVX-NEXT: testb $32, %al +; AVX-NEXT: jne .LBB17_11 +; AVX-NEXT: .LBB17_12: # %else10 +; AVX-NEXT: testb $64, %al +; AVX-NEXT: jne .LBB17_13 +; AVX-NEXT: .LBB17_14: # %else12 +; AVX-NEXT: testb $-128, %al +; AVX-NEXT: jne .LBB17_15 +; AVX-NEXT: .LBB17_16: # %else14 +; AVX-NEXT: retq +; AVX-NEXT: .LBB17_1: # %cond.store ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB17_2: # %else -; AVX-NEXT: vpextrb $2, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB17_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB17_3: # %cond.store1 ; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB17_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $4, %al ; AVX-NEXT: je .LBB17_6 -; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: .LBB17_5: # %cond.store3 ; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB17_6: # %else4 -; AVX-NEXT: vpextrb $6, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $8, %al ; AVX-NEXT: je .LBB17_8 -; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: .LBB17_7: # %cond.store5 ; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX-NEXT: .LBB17_8: # %else6 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $16, %al ; AVX-NEXT: je .LBB17_10 -; AVX-NEXT: # %bb.9: # %cond.store7 +; AVX-NEXT: .LBB17_9: # %cond.store7 ; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX-NEXT: .LBB17_10: # %else8 -; AVX-NEXT: vpextrb $10, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $32, %al ; AVX-NEXT: je .LBB17_12 -; AVX-NEXT: # %bb.11: # %cond.store9 +; AVX-NEXT: .LBB17_11: # %cond.store9 ; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX-NEXT: .LBB17_12: # %else10 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $64, %al ; AVX-NEXT: je .LBB17_14 -; AVX-NEXT: # %bb.13: # %cond.store11 +; AVX-NEXT: .LBB17_13: # %cond.store11 ; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX-NEXT: .LBB17_14: # %else12 -; AVX-NEXT: vpextrb $14, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $-128, %al ; AVX-NEXT: je .LBB17_16 -; AVX-NEXT: # %bb.15: # %cond.store13 +; AVX-NEXT: .LBB17_15: # %cond.store13 ; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX-NEXT: .LBB17_16: # %else14 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v8i16_v8i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; 
AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB17_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB17_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB17_3 +; AVX512F-NEXT: .LBB17_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB17_5 +; AVX512F-NEXT: .LBB17_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB17_7 +; AVX512F-NEXT: .LBB17_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB17_9 +; AVX512F-NEXT: .LBB17_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB17_11 +; AVX512F-NEXT: .LBB17_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB17_13 +; AVX512F-NEXT: .LBB17_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB17_15 +; AVX512F-NEXT: .LBB17_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB17_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB17_2: # %else -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB17_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB17_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB17_4: # %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB17_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB17_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB17_6: # %else4 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB17_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB17_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB17_8: # %else6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB17_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB17_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: 
.LBB17_10: # %else8 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB17_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB17_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB17_12: # %else10 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB17_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB17_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB17_14: # %else12 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB17_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB17_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB17_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; Index: llvm/trunk/test/CodeGen/X86/masked_store_trunc_ssat.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ llvm/trunk/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -11,140 +11,134 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm6, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: packssdw %xmm0, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483647,2147483647] -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pxor %xmm11, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pand %xmm12, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm13 ; SSE2-NEXT: pand %xmm13, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm13 +; SSE2-NEXT: pandn %xmm9, %xmm13 ; SSE2-NEXT: por %xmm2, %xmm13 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: 
pshufd {{.*#+}} xmm14 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm10, %xmm6 +; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067968,18446744071562067968] ; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: pxor %xmm11, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [18446744069414584320,18446744069414584320] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm14, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm12, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm1 +; SSE2-NEXT: pxor %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm14, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm12, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pand %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: 
pxor %xmm12, %xmm0 +; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm14, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm12, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] -; SSE2-NEXT: pxor %xmm13, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm14, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] +; SSE2-NEXT: pxor %xmm13, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm12, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: movd %xmm8, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-NEXT: pxor %xmm7, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm7, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm0, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: psrlq $16, %xmm9 -; SSE2-NEXT: movd %xmm9, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: por %xmm10, %xmm3 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm6, 4(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm4, 4(%rdi) ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: pandn %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pextrw $4, %xmm7, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pandn %xmm9, %xmm0 +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] @@ -152,58 +146,49 @@ ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm13 -; SSE2-NEXT: pextrw $6, %xmm7, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) ; SSE2-NEXT: .LBB0_8: # %else6 ; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm2[0,2] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB0_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB0_9 +; SSE2-NEXT: # %bb.10: # %else8 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB0_11 +; SSE2-NEXT: .LBB0_12: # %else10 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB0_13 +; SSE2-NEXT: .LBB0_14: # %else12 +; SSE2-NEXT: testb 
$-128, %al +; SSE2-NEXT: jne .LBB0_15 +; SSE2-NEXT: .LBB0_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB0_9: # %cond.store7 ; SSE2-NEXT: movss %xmm13, 16(%rdi) -; SSE2-NEXT: .LBB0_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB0_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 +; SSE2-NEXT: .LBB0_11: # %cond.store9 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,3] ; SSE2-NEXT: movd %xmm0, 20(%rdi) -; SSE2-NEXT: .LBB0_12: # %else10 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB0_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,0,1] -; SSE2-NEXT: movd %xmm1, 24(%rdi) -; SSE2-NEXT: .LBB0_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB0_13: # %cond.store11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,0,1] +; SSE2-NEXT: movd %xmm0, 24(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB0_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB0_15: # %cond.store13 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] ; SSE2-NEXT: movd %xmm0, 28(%rdi) -; SSE2-NEXT: .LBB0_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE4-NEXT: pxor %xmm0, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 @@ -220,85 +205,87 @@ ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE4-NEXT: movapd %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm1, %xmm2 +; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm2 ; SSE4-NEXT: movapd %xmm3, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm2, %xmm7 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm1, %xmm7 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE4-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2] +; SSE4-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] ; SSE4-NEXT: movapd %xmm6, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm2, %xmm3 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm1, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 ; SSE4-NEXT: movapd %xmm10, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_2 -; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: movss %xmm7, (%rdi) -; SSE4-NEXT: .LBB0_2: # %else -; SSE4-NEXT: pextrb $4, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: extractps $1, %xmm7, 4(%rdi) -; SSE4-NEXT: .LBB0_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm4, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: extractps $2, %xmm7, 8(%rdi) +; SSE4-NEXT: pxor %xmm0, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm0, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB0_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB0_3 +; SSE4-NEXT: .LBB0_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB0_5 ; SSE4-NEXT: .LBB0_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB0_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB0_7: # %cond.store5 ; SSE4-NEXT: extractps $3, %xmm7, 12(%rdi) ; SSE4-NEXT: .LBB0_8: # %else6 -; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE4-NEXT: xorps %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: movss %xmm2, 16(%rdi) -; SSE4-NEXT: .LBB0_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: extractps $1, %xmm2, 20(%rdi) +; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB0_9 +; SSE4-NEXT: # %bb.10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB0_11 ; SSE4-NEXT: .LBB0_12: # %else10 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm5, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: extractps $2, %xmm2, 24(%rdi) +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB0_13 ; SSE4-NEXT: .LBB0_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: extractps $3, %xmm2, 28(%rdi) +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB0_15 ; SSE4-NEXT: .LBB0_16: # %else14 ; SSE4-NEXT: retq +; SSE4-NEXT: .LBB0_1: # %cond.store +; SSE4-NEXT: movss %xmm7, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB0_4 +; SSE4-NEXT: .LBB0_3: # %cond.store1 +; SSE4-NEXT: extractps $1, %xmm7, 4(%rdi) +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: je .LBB0_6 +; SSE4-NEXT: .LBB0_5: # %cond.store3 +; SSE4-NEXT: extractps $2, %xmm7, 8(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB0_7 +; SSE4-NEXT: jmp .LBB0_8 +; SSE4-NEXT: .LBB0_9: # %cond.store7 +; SSE4-NEXT: movss %xmm1, 16(%rdi) +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: je .LBB0_12 +; SSE4-NEXT: .LBB0_11: # %cond.store9 +; SSE4-NEXT: extractps $1, %xmm1, 20(%rdi) +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: je .LBB0_14 +; SSE4-NEXT: .LBB0_13: # %cond.store11 +; SSE4-NEXT: extractps $2, %xmm1, 24(%rdi) +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: je .LBB0_16 +; SSE4-NEXT: .LBB0_15: # %cond.store13 +; SSE4-NEXT: extractps $3, %xmm1, 28(%rdi) +; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i32: ; AVX1: # %bb.0: @@ -403,202 +390,195 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask) { ; 
SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm6, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: packssdw %xmm0, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [32767,32767] -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767] +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pxor %xmm11, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm14 -; SSE2-NEXT: por %xmm2, %xmm14 +; SSE2-NEXT: pand %xmm12, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm13 +; SSE2-NEXT: por %xmm2, %xmm13 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm10, %xmm6 +; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} 
xmm10 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709518848,18446744073709518848] ; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200] ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 +; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm1 +; SSE2-NEXT: pxor %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pxor %xmm13, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] ; SSE2-NEXT: pand %xmm2, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm14 -; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: por %xmm14, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm13 +; SSE2-NEXT: pandn %xmm9, %xmm1 +; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB1_2: # %else -; SSE2-NEXT: psrlq $16, %xmm9 -; SSE2-NEXT: movd %xmm9, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; 
SSE2-NEXT: .LBB1_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm0, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB1_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB1_3 +; SSE2-NEXT: .LBB1_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB1_5 ; SSE2-NEXT: .LBB1_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB1_7 ; SSE2-NEXT: .LBB1_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB1_9 ; SSE2-NEXT: .LBB1_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB1_11 ; SSE2-NEXT: .LBB1_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB1_13 ; SSE2-NEXT: .LBB1_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB1_15 +; SSE2-NEXT: .LBB1_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB1_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB1_4 +; SSE2-NEXT: .LBB1_3: # %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB1_6 +; SSE2-NEXT: .LBB1_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB1_8 +; SSE2-NEXT: .LBB1_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB1_10 +; SSE2-NEXT: .LBB1_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB1_12 +; SSE2-NEXT: .LBB1_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB1_14 +; SSE2-NEXT: .LBB1_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al ; 
SSE2-NEXT: je .LBB1_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB1_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) -; SSE2-NEXT: .LBB1_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE4-NEXT: pxor %xmm0, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [32767,32767] ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 @@ -634,250 +614,238 @@ ; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1 ; SSE4-NEXT: packssdw %xmm3, %xmm1 ; SSE4-NEXT: packssdw %xmm1, %xmm7 -; SSE4-NEXT: pextrb $0, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB1_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm0, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm0, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB1_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB1_3 +; SSE4-NEXT: .LBB1_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB1_5 +; SSE4-NEXT: .LBB1_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB1_7 +; SSE4-NEXT: .LBB1_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB1_9 +; SSE4-NEXT: .LBB1_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB1_11 +; SSE4-NEXT: .LBB1_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB1_13 +; SSE4-NEXT: .LBB1_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB1_15 +; SSE4-NEXT: .LBB1_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB1_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm7, (%rdi) -; SSE4-NEXT: .LBB1_2: # %else -; SSE4-NEXT: pextrb $4, %xmm8, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB1_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB1_3: # %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm7, 2(%rdi) -; SSE4-NEXT: .LBB1_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm4, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB1_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB1_5: # %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm7, 4(%rdi) -; SSE4-NEXT: .LBB1_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB1_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB1_7: # %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm7, 6(%rdi) -; SSE4-NEXT: .LBB1_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB1_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB1_9: # %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm7, 8(%rdi) -; SSE4-NEXT: .LBB1_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB1_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB1_11: # %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm7, 10(%rdi) -; SSE4-NEXT: .LBB1_12: # 
%else10 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm5, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB1_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB1_13: # %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm7, 12(%rdi) -; SSE4-NEXT: .LBB1_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB1_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB1_15: # %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm7, 14(%rdi) -; SSE4-NEXT: .LBB1_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm9, %xmm5, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32767,32767] -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm7, %xmm12 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm7, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm13 -; AVX1-NEXT: vblendvpd %xmm3, %xmm5, %xmm7, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm14 -; AVX1-NEXT: vblendvpd %xmm12, %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm11, %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm14, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vblendvpd %xmm13, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32767,32767] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm10 +; AVX1-NEXT: vblendvpd %xmm5, %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm11 +; AVX1-NEXT: vblendvpd %xmm9, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm7 +; AVX1-NEXT: vblendvpd %xmm8, %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm11, %xmm5, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm10, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB1_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB1_1 +; AVX1-NEXT: # %bb.2: # 
%else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB1_3 +; AVX1-NEXT: .LBB1_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB1_5 +; AVX1-NEXT: .LBB1_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB1_7 +; AVX1-NEXT: .LBB1_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB1_9 +; AVX1-NEXT: .LBB1_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB1_11 +; AVX1-NEXT: .LBB1_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB1_13 +; AVX1-NEXT: .LBB1_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB1_15 +; AVX1-NEXT: .LBB1_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB1_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB1_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB1_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB1_3: # %cond.store1 ; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB1_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB1_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB1_5: # %cond.store3 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB1_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB1_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB1_7: # %cond.store5 ; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB1_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB1_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB1_9: # %cond.store7 ; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB1_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB1_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB1_11: # %cond.store9 ; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB1_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB1_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB1_13: # %cond.store11 ; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB1_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB1_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB1_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB1_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i64_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, 
%xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [32767,32767,32767,32767] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm6, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm6, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm1, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm0, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [32767,32767,32767,32767] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm5, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB1_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB1_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB1_3 +; AVX2-NEXT: .LBB1_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB1_5 +; AVX2-NEXT: .LBB1_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB1_7 +; AVX2-NEXT: .LBB1_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB1_9 +; AVX2-NEXT: .LBB1_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB1_11 +; AVX2-NEXT: .LBB1_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB1_13 +; AVX2-NEXT: .LBB1_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB1_15 +; AVX2-NEXT: .LBB1_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB1_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB1_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB1_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB1_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB1_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB1_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB1_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB1_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB1_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB1_7: # 
%cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB1_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB1_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB1_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB1_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB1_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB1_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB1_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB1_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB1_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB1_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB1_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB1_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB1_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -888,66 +856,61 @@ ; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB1_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB1_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB1_3 +; AVX512F-NEXT: .LBB1_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB1_5 +; AVX512F-NEXT: .LBB1_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB1_7 +; AVX512F-NEXT: .LBB1_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB1_9 +; AVX512F-NEXT: .LBB1_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB1_11 +; AVX512F-NEXT: .LBB1_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB1_13 +; AVX512F-NEXT: .LBB1_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB1_15 +; AVX512F-NEXT: .LBB1_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB1_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB1_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB1_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB1_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB1_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB1_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB1_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB1_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; 
AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB1_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB1_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB1_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB1_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB1_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB1_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB1_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB1_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB1_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB1_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB1_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB1_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB1_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB1_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB1_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -982,202 +945,195 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm6, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: packssdw %xmm0, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [127,127] -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pxor %xmm11, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm14 -; SSE2-NEXT: por %xmm2, %xmm14 +; SSE2-NEXT: pand %xmm12, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm13 +; SSE2-NEXT: por %xmm2, %xmm13 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm10, 
%xmm6 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm10, %xmm6 +; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 +; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: 
movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm1 +; SSE2-NEXT: pxor %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pxor %xmm13, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] ; SSE2-NEXT: pand %xmm2, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm14 -; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: por %xmm14, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm13 +; SSE2-NEXT: pandn %xmm9, %xmm1 +; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB2_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB2_2: # %else -; SSE2-NEXT: psrlq $16, %xmm9 -; SSE2-NEXT: movd %xmm9, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB2_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB2_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm0, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB2_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB2_3 +; SSE2-NEXT: .LBB2_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB2_5 ; SSE2-NEXT: .LBB2_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB2_7 ; SSE2-NEXT: .LBB2_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB2_9 ; SSE2-NEXT: .LBB2_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_12 -; SSE2-NEXT: # %bb.11: # 
%cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB2_11 ; SSE2-NEXT: .LBB2_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB2_13 ; SSE2-NEXT: .LBB2_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB2_15 +; SSE2-NEXT: .LBB2_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB2_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB2_4 +; SSE2-NEXT: .LBB2_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB2_6 +; SSE2-NEXT: .LBB2_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB2_8 +; SSE2-NEXT: .LBB2_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB2_10 +; SSE2-NEXT: .LBB2_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB2_12 +; SSE2-NEXT: .LBB2_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB2_14 +; SSE2-NEXT: .LBB2_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB2_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB2_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB2_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE4-NEXT: pxor %xmm0, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [127,127] ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 @@ -1213,250 +1169,238 @@ ; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1 ; SSE4-NEXT: packssdw %xmm3, %xmm1 ; SSE4-NEXT: packssdw %xmm1, %xmm7 -; SSE4-NEXT: pextrb $0, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB2_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm0, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm0, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB2_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB2_3 +; SSE4-NEXT: .LBB2_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB2_5 +; SSE4-NEXT: .LBB2_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB2_7 +; SSE4-NEXT: .LBB2_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB2_9 +; SSE4-NEXT: .LBB2_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB2_11 +; SSE4-NEXT: .LBB2_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB2_13 +; SSE4-NEXT: 
.LBB2_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB2_15 +; SSE4-NEXT: .LBB2_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB2_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm7, (%rdi) -; SSE4-NEXT: .LBB2_2: # %else -; SSE4-NEXT: pextrb $4, %xmm8, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB2_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB2_3: # %cond.store1 ; SSE4-NEXT: pextrb $2, %xmm7, 1(%rdi) -; SSE4-NEXT: .LBB2_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm4, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB2_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB2_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm7, 2(%rdi) -; SSE4-NEXT: .LBB2_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB2_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB2_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm7, 3(%rdi) -; SSE4-NEXT: .LBB2_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB2_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB2_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm7, 4(%rdi) -; SSE4-NEXT: .LBB2_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB2_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB2_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm7, 5(%rdi) -; SSE4-NEXT: .LBB2_12: # %else10 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm5, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB2_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB2_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm7, 6(%rdi) -; SSE4-NEXT: .LBB2_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB2_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB2_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm7, 7(%rdi) -; SSE4-NEXT: .LBB2_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm9, %xmm5, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm7, %xmm12 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm7, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm13 -; AVX1-NEXT: vblendvpd %xmm3, %xmm5, %xmm7, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm14 -; AVX1-NEXT: vblendvpd %xmm12, %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm11, %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: 
vblendvpd %xmm7, %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm14, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vblendvpd %xmm13, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm10 +; AVX1-NEXT: vblendvpd %xmm5, %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm11 +; AVX1-NEXT: vblendvpd %xmm9, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm7 +; AVX1-NEXT: vblendvpd %xmm8, %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm11, %xmm5, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm10, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB2_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB2_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB2_3 +; AVX1-NEXT: .LBB2_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB2_5 +; AVX1-NEXT: .LBB2_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB2_7 +; AVX1-NEXT: .LBB2_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB2_9 +; AVX1-NEXT: .LBB2_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB2_11 +; AVX1-NEXT: .LBB2_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB2_13 +; AVX1-NEXT: .LBB2_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB2_15 +; AVX1-NEXT: .LBB2_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB2_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB2_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB2_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB2_3: # %cond.store1 ; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB2_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB2_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB2_5: # %cond.store3 ; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB2_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb 
$8, %al ; AVX1-NEXT: je .LBB2_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB2_7: # %cond.store5 ; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB2_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB2_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB2_9: # %cond.store7 ; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB2_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB2_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB2_11: # %cond.store9 ; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB2_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB2_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB2_13: # %cond.store11 ; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB2_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB2_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB2_15: # %cond.store13 ; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB2_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i64_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm6, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm6, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm1, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm0, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm5, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB2_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB2_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: 
testb $2, %al +; AVX2-NEXT: jne .LBB2_3 +; AVX2-NEXT: .LBB2_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB2_5 +; AVX2-NEXT: .LBB2_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB2_7 +; AVX2-NEXT: .LBB2_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB2_9 +; AVX2-NEXT: .LBB2_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB2_11 +; AVX2-NEXT: .LBB2_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB2_13 +; AVX2-NEXT: .LBB2_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB2_15 +; AVX2-NEXT: .LBB2_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB2_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB2_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB2_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB2_3: # %cond.store1 ; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB2_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB2_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB2_5: # %cond.store3 ; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB2_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB2_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB2_7: # %cond.store5 ; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB2_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB2_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB2_9: # %cond.store7 ; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB2_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB2_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB2_11: # %cond.store9 ; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB2_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB2_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB2_13: # %cond.store11 ; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB2_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB2_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB2_15: # %cond.store13 ; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB2_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1469,66 +1413,61 @@ ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB2_2 -; 
AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB2_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB2_3 +; AVX512F-NEXT: .LBB2_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB2_5 +; AVX512F-NEXT: .LBB2_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB2_7 +; AVX512F-NEXT: .LBB2_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB2_9 +; AVX512F-NEXT: .LBB2_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB2_11 +; AVX512F-NEXT: .LBB2_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB2_13 +; AVX512F-NEXT: .LBB2_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB2_15 +; AVX512F-NEXT: .LBB2_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB2_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB2_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB2_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB2_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB2_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB2_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB2_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB2_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB2_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB2_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB2_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB2_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB2_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB2_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB2_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB2_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB2_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB2_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB2_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB2_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB2_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB2_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB2_16: # %else14 ; AVX512F-NEXT: 
vzeroupper ; AVX512F-NEXT: retq ; @@ -1563,30 +1502,29 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm8, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -1622,50 +1560,45 @@ ; SSE2-NEXT: pandn %xmm8, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: movd %xmm10, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB3_2 -; SSE2-NEXT: # %bb.1: # %cond.store +; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE2-NEXT: movmskps %xmm9, %eax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB3_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB3_3 +; SSE2-NEXT: .LBB3_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB3_5 +; SSE2-NEXT: .LBB3_6: # %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB3_7 +; SSE2-NEXT: .LBB3_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB3_1: # %cond.store ; SSE2-NEXT: movss %xmm0, (%rdi) -; SSE2-NEXT: .LBB3_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: pextrw $2, %xmm10, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB3_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm3, 4(%rdi) -; SSE2-NEXT: .LBB3_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: .LBB3_3: # %cond.store1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm1, 4(%rdi) +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB3_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 +; SSE2-NEXT: .LBB3_5: # %cond.store3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: movd %xmm1, 8(%rdi) -; SSE2-NEXT: .LBB3_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al ; 
SSE2-NEXT: je .LBB3_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB3_7: # %cond.store5 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) -; SSE2-NEXT: .LBB3_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i64_v4i32: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 @@ -1683,34 +1616,36 @@ ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB3_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE4-NEXT: movmskps %xmm4, %eax +; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB3_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB3_3 +; SSE4-NEXT: .LBB3_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB3_5 +; SSE4-NEXT: .LBB3_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB3_7 +; SSE4-NEXT: .LBB3_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB3_1: # %cond.store ; SSE4-NEXT: movss %xmm1, (%rdi) -; SSE4-NEXT: .LBB3_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB3_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB3_3: # %cond.store1 ; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) -; SSE4-NEXT: .LBB3_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB3_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB3_5: # %cond.store3 ; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi) -; SSE4-NEXT: .LBB3_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB3_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB3_7: # %cond.store5 ; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi) -; SSE4-NEXT: .LBB3_8: # %else6 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i64_v4i32: @@ -1806,30 +1741,29 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: 
por %xmm6, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm8, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -1865,51 +1799,46 @@ ; SSE2-NEXT: pandn %xmm8, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm10, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB4_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB4_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: pextrw $2, %xmm10, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB4_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) +; SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE2-NEXT: movmskps %xmm9, %eax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB4_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB4_3 ; SSE2-NEXT: .LBB4_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB4_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB4_5 ; SSE2-NEXT: .LBB4_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB4_7 +; SSE2-NEXT: .LBB4_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB4_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB4_4 +; SSE2-NEXT: .LBB4_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB4_6 +; SSE2-NEXT: .LBB4_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB4_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB4_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) -; SSE2-NEXT: .LBB4_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i64_v4i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [32767,32767] ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 @@ -1927,42 +1856,41 @@ ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE4-NEXT: packssdw %xmm3, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB4_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE4-NEXT: movmskps %xmm4, %eax +; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB4_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al 
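Every updated check block in these truncstore tests follows the same shape: the lane mask is condensed into a general-purpose register once (movmskpd/movmskps/pmovmskb, with xorl $3 / xorl $15 or notl to undo the compare-against-zero), after which each lane costs a single testb against an immediate bit instead of a pextrb/pextrw plus testb $1 per lane. Below is a minimal LLVM IR sketch of the scalarized form that produces this pattern for the two-lane case; the function and value names are illustrative only, not taken from these tests.

define void @sketch_masked_store_v2i64(<2 x i64> %v, <2 x i64>* %p, <2 x i1> %m) {
entry:
  %base = bitcast <2 x i64>* %p to i64*
  ; Condense the whole mask into an integer once, rather than extracting
  ; one i1 per lane; this is what lowers to movmskpd + testb in the checks.
  %scalar_mask = bitcast <2 x i1> %m to i2
  %bit0 = and i2 %scalar_mask, 1
  %cmp0 = icmp ne i2 %bit0, 0
  br i1 %cmp0, label %cond.store, label %else

cond.store:
  %elt0 = extractelement <2 x i64> %v, i32 0
  store i64 %elt0, i64* %base, align 8
  br label %else

else:
  ; -2 is the i2 encoding of 0b10, i.e. the bit for lane 1.
  %bit1 = and i2 %scalar_mask, -2
  %cmp1 = icmp ne i2 %bit1, 0
  br i1 %cmp1, label %cond.store1, label %else2

cond.store1:
  %elt1 = extractelement <2 x i64> %v, i32 1
  %gep1 = getelementptr i64, i64* %base, i32 1
  store i64 %elt1, i64* %gep1, align 8
  br label %else2

else2:
  ret void
}

The testb/jne fall-through chains in the new expectations are the branch-folded form of this chain of icmp/br pairs, which is why the cond.store blocks now sit after the shared return instead of inline between the tests.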
+; SSE4-NEXT: jne .LBB4_3 +; SSE4-NEXT: .LBB4_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB4_5 +; SSE4-NEXT: .LBB4_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB4_7 +; SSE4-NEXT: .LBB4_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB4_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm1, (%rdi) -; SSE4-NEXT: .LBB4_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB4_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB4_3: # %cond.store1 ; SSE4-NEXT: pextrw $2, %xmm1, 2(%rdi) -; SSE4-NEXT: .LBB4_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB4_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB4_5: # %cond.store3 ; SSE4-NEXT: pextrw $4, %xmm1, 4(%rdi) -; SSE4-NEXT: .LBB4_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB4_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB4_7: # %cond.store5 ; SSE4-NEXT: pextrw $6, %xmm1, 6(%rdi) -; SSE4-NEXT: .LBB4_8: # %else6 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i64_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -1975,43 +1903,43 @@ ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB4_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB4_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB4_3 +; AVX1-NEXT: .LBB4_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB4_5 +; AVX1-NEXT: .LBB4_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB4_7 +; AVX1-NEXT: .LBB4_8: # %else6 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB4_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB4_2: # %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB4_3: # %cond.store1 ; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB4_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB4_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB4_5: # %cond.store3 ; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB4_6: # %else4 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB4_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB4_7: # %cond.store5 ; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB4_8: # %else6 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
truncstore_v4i64_v4i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [32767,32767,32767,32767] ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 @@ -2020,34 +1948,37 @@ ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB4_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB4_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB4_3 +; AVX2-NEXT: .LBB4_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB4_5 +; AVX2-NEXT: .LBB4_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB4_7 +; AVX2-NEXT: .LBB4_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB4_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB4_2: # %else -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB4_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB4_3: # %cond.store1 ; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB4_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB4_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB4_5: # %cond.store3 ; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB4_6: # %else4 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB4_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB4_7: # %cond.store5 ; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB4_8: # %else6 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2056,41 +1987,40 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [32767,32767,32767,32767] -; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB4_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB4_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB4_3 +; AVX512F-NEXT: .LBB4_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB4_5 +; AVX512F-NEXT: .LBB4_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB4_7 +; AVX512F-NEXT: .LBB4_8: # %else6 +; AVX512F-NEXT: vzeroupper +; 
AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB4_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB4_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB4_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB4_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB4_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB4_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB4_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB4_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB4_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB4_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB4_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2132,30 +2062,29 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm8, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -2191,51 +2120,46 @@ ; SSE2-NEXT: pandn %xmm8, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm10, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB5_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB5_2: # %else -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: pextrw $2, %xmm10, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB5_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: pcmpeqd 
%xmm2, %xmm9 +; SSE2-NEXT: movmskps %xmm9, %eax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB5_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB5_3 ; SSE2-NEXT: .LBB5_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB5_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB5_5 ; SSE2-NEXT: .LBB5_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB5_7 +; SSE2-NEXT: .LBB5_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB5_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB5_4 +; SSE2-NEXT: .LBB5_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB5_6 +; SSE2-NEXT: .LBB5_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB5_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB5_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 3(%rdi) -; SSE2-NEXT: .LBB5_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127] ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 @@ -2253,42 +2177,41 @@ ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE4-NEXT: packssdw %xmm3, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB5_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE4-NEXT: movmskps %xmm4, %eax +; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB5_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB5_3 +; SSE4-NEXT: .LBB5_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB5_5 +; SSE4-NEXT: .LBB5_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB5_7 +; SSE4-NEXT: .LBB5_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB5_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm1, (%rdi) -; SSE4-NEXT: .LBB5_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB5_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB5_3: # %cond.store1 ; SSE4-NEXT: pextrb $4, %xmm1, 1(%rdi) -; SSE4-NEXT: .LBB5_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB5_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB5_5: # %cond.store3 ; SSE4-NEXT: pextrb $8, %xmm1, 2(%rdi) -; SSE4-NEXT: .LBB5_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB5_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; 
SSE4-NEXT: .LBB5_7: # %cond.store5 ; SSE4-NEXT: pextrb $12, %xmm1, 3(%rdi) -; SSE4-NEXT: .LBB5_8: # %else6 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i64_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -2301,43 +2224,43 @@ ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB5_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB5_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB5_3 +; AVX1-NEXT: .LBB5_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB5_5 +; AVX1-NEXT: .LBB5_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB5_7 +; AVX1-NEXT: .LBB5_8: # %else6 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB5_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB5_2: # %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB5_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB5_3: # %cond.store1 ; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB5_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB5_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB5_5: # %cond.store3 ; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB5_6: # %else4 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB5_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB5_7: # %cond.store5 ; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB5_8: # %else6 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v4i64_v4i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [127,127,127,127] ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 @@ -2346,34 +2269,37 @@ ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB5_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB5_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB5_3 +; AVX2-NEXT: .LBB5_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB5_5 +; AVX2-NEXT: .LBB5_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB5_7 +; AVX2-NEXT: .LBB5_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB5_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, 
(%rdi) -; AVX2-NEXT: .LBB5_2: # %else -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB5_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB5_3: # %cond.store1 ; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB5_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB5_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB5_5: # %cond.store3 ; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB5_6: # %else4 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB5_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB5_7: # %cond.store5 ; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB5_8: # %else6 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2382,41 +2308,40 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] -; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB5_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB5_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB5_3 +; AVX512F-NEXT: .LBB5_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB5_5 +; AVX512F-NEXT: .LBB5_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB5_7 +; AVX512F-NEXT: .LBB5_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB5_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB5_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB5_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB5_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB5_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB5_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB5_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB5_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB5_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB5_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; 
AVX512F-NEXT: .LBB5_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2459,82 +2384,83 @@ ; SSE2-LABEL: truncstore_v2i64_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB6_2 -; SSE2-NEXT: # %bb.1: # %cond.store +; SSE2-NEXT: jne .LBB6_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB6_3 +; SSE2-NEXT: .LBB6_4: # %else2 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB6_1: # %cond.store ; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: .LBB6_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB6_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 +; SSE2-NEXT: .LBB6_3: # %cond.store1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movd %xmm0, 4(%rdi) -; SSE2-NEXT: .LBB6_4: # %else2 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i32: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 -; 
SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSE4-NEXT: movdqa %xmm3, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm3 +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] -; SSE4-NEXT: movapd %xmm3, %xmm0 +; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB6_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 +; SSE4-NEXT: movmskpd %xmm3, %eax +; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB6_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB6_3 +; SSE4-NEXT: .LBB6_4: # %else2 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB6_1: # %cond.store ; SSE4-NEXT: movss %xmm2, (%rdi) -; SSE4-NEXT: .LBB6_2: # %else -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB6_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB6_3: # %cond.store1 ; SSE4-NEXT: extractps $2, %xmm2, 4(%rdi) -; SSE4-NEXT: .LBB6_4: # %else2 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v2i64_v2i32: @@ -2624,109 +2550,111 @@ ; SSE2-LABEL: truncstore_v2i64_v2i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd 
%xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB7_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB7_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB7_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB7_3 +; SSE2-NEXT: .LBB7_4: # %else2 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB7_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB7_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 +; SSE2-NEXT: .LBB7_3: # %cond.store1 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: .LBB7_4: # %else2 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767] -; SSE4-NEXT: movdqa %xmm3, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm3 +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] +; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] -; SSE4-NEXT: movapd %xmm3, %xmm0 +; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB7_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 +; SSE4-NEXT: movmskpd %xmm3, %eax +; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB7_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB7_3 +; SSE4-NEXT: .LBB7_4: # %else2 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB7_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm2, (%rdi) -; SSE4-NEXT: .LBB7_2: # %else -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB7_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB7_3: # %cond.store1 ; SSE4-NEXT: pextrw $4, %xmm2, 2(%rdi) -; SSE4-NEXT: .LBB7_4: # %else2 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v2i64_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; 
AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 -; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vmovmskpd %xmm1, %eax +; AVX-NEXT: xorl $3, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB7_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB7_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB7_3 +; AVX-NEXT: .LBB7_4: # %else2 +; AVX-NEXT: retq +; AVX-NEXT: .LBB7_1: # %cond.store ; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB7_2: # %else -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB7_3: # %cond.store1 ; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB7_4: # %else2 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: @@ -2734,24 +2662,25 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767] -; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB7_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB7_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB7_3 +; AVX512F-NEXT: .LBB7_4: # %else2 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB7_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB7_2: # %else -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB7_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB7_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB7_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2793,109 +2722,111 @@ ; SSE2-LABEL: truncstore_v2i64_v2i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; 
SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB8_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB8_2: # %else -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB8_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB8_3 +; SSE2-NEXT: .LBB8_4: # %else2 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB8_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB8_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 +; SSE2-NEXT: .LBB8_3: # %cond.store1 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB8_4: # %else2 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v2i64_v2i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [127,127] -; SSE4-NEXT: movdqa %xmm3, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm3 +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [127,127] +; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] -; SSE4-NEXT: movapd %xmm3, %xmm0 +; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB8_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 +; SSE4-NEXT: 
movmskpd %xmm3, %eax +; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB8_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB8_3 +; SSE4-NEXT: .LBB8_4: # %else2 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB8_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm2, (%rdi) -; SSE4-NEXT: .LBB8_2: # %else -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB8_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB8_3: # %cond.store1 ; SSE4-NEXT: pextrb $8, %xmm2, 1(%rdi) -; SSE4-NEXT: .LBB8_4: # %else2 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v2i64_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 -; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vmovmskpd %xmm1, %eax +; AVX-NEXT: xorl $3, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB8_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB8_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB8_3 +; AVX-NEXT: .LBB8_4: # %else2 +; AVX-NEXT: retq +; AVX-NEXT: .LBB8_1: # %cond.store ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB8_2: # %else -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB8_3: # %cond.store1 ; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB8_4: # %else2 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: @@ -2903,24 +2834,25 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127] -; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] -; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB8_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB8_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB8_3 +; AVX512F-NEXT: .LBB8_4: # %else2 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB8_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB8_2: # %else -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB8_4 -; AVX512F-NEXT: # %bb.3: # 
%cond.store1 +; AVX512F-NEXT: .LBB8_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB8_4: # %else2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2961,651 +2893,544 @@ ; SSE2-LABEL: truncstore_v16i32_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB9_2: # %else -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm9, %xmm8 -; SSE2-NEXT: pextrw $2, %xmm8, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: .LBB9_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm9 -; SSE2-NEXT: pextrw $4, %xmm9, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) -; SSE2-NEXT: .LBB9_6: # %else4 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm6, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB9_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB9_3 +; SSE2-NEXT: .LBB9_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB9_5 +; SSE2-NEXT: .LBB9_6: # %else4 +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB9_7 ; SSE2-NEXT: .LBB9_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB9_9 ; SSE2-NEXT: .LBB9_10: # %else8 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB9_11 ; SSE2-NEXT: .LBB9_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pextrw $4, %xmm4, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB9_13 ; SSE2-NEXT: .LBB9_14: # %else12 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pextrw $6, %xmm5, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB9_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 
-; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movw %ax, 14(%rdi) +; SSE2-NEXT: .LBB9_15: # %cond.store13 +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 14(%rdi) ; SSE2-NEXT: .LBB9_16: # %else14 ; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_18 -; SSE2-NEXT: # %bb.17: # %cond.store15 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: movw %ax, 16(%rdi) -; SSE2-NEXT: .LBB9_18: # %else16 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_20 -; SSE2-NEXT: # %bb.19: # %cond.store17 -; SSE2-NEXT: pextrw $1, %xmm2, %eax -; SSE2-NEXT: movw %ax, 18(%rdi) +; SSE2-NEXT: testl $256, %eax # imm = 0x100 +; SSE2-NEXT: jne .LBB9_17 +; SSE2-NEXT: # %bb.18: # %else16 +; SSE2-NEXT: testl $512, %eax # imm = 0x200 +; SSE2-NEXT: jne .LBB9_19 ; SSE2-NEXT: .LBB9_20: # %else18 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_22 -; SSE2-NEXT: # %bb.21: # %cond.store19 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: movw %ax, 20(%rdi) +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 +; SSE2-NEXT: jne .LBB9_21 ; SSE2-NEXT: .LBB9_22: # %else20 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: pextrw $6, %xmm6, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_24 -; SSE2-NEXT: # %bb.23: # %cond.store21 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: movw %ax, 22(%rdi) +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 +; SSE2-NEXT: jne .LBB9_23 ; SSE2-NEXT: .LBB9_24: # %else22 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_26 -; SSE2-NEXT: # %bb.25: # %cond.store23 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: movw %ax, 24(%rdi) +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE2-NEXT: jne .LBB9_25 ; SSE2-NEXT: .LBB9_26: # %else24 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_28 -; SSE2-NEXT: # %bb.27: # %cond.store25 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: movw %ax, 26(%rdi) +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE2-NEXT: jne .LBB9_27 ; SSE2-NEXT: .LBB9_28: # %else26 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB9_30 -; SSE2-NEXT: # %bb.29: # %cond.store27 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: movw %ax, 28(%rdi) +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE2-NEXT: jne .LBB9_29 ; SSE2-NEXT: .LBB9_30: # %else28 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pextrw $6, %xmm7, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB9_31 +; SSE2-NEXT: .LBB9_32: # %else30 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB9_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB9_4 +; SSE2-NEXT: .LBB9_3: # %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; 
SSE2-NEXT: je .LBB9_6 +; SSE2-NEXT: .LBB9_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB9_8 +; SSE2-NEXT: .LBB9_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB9_10 +; SSE2-NEXT: .LBB9_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB9_12 +; SSE2-NEXT: .LBB9_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB9_14 +; SSE2-NEXT: .LBB9_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB9_15 +; SSE2-NEXT: jmp .LBB9_16 +; SSE2-NEXT: .LBB9_17: # %cond.store15 +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: movw %cx, 16(%rdi) +; SSE2-NEXT: testl $512, %eax # imm = 0x200 +; SSE2-NEXT: je .LBB9_20 +; SSE2-NEXT: .LBB9_19: # %cond.store17 +; SSE2-NEXT: pextrw $1, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 18(%rdi) +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 +; SSE2-NEXT: je .LBB9_22 +; SSE2-NEXT: .LBB9_21: # %cond.store19 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 20(%rdi) +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 +; SSE2-NEXT: je .LBB9_24 +; SSE2-NEXT: .LBB9_23: # %cond.store21 +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 22(%rdi) +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE2-NEXT: je .LBB9_26 +; SSE2-NEXT: .LBB9_25: # %cond.store23 +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 24(%rdi) +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE2-NEXT: je .LBB9_28 +; SSE2-NEXT: .LBB9_27: # %cond.store25 +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 26(%rdi) +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE2-NEXT: je .LBB9_30 +; SSE2-NEXT: .LBB9_29: # %cond.store27 +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 28(%rdi) +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB9_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 +; SSE2-NEXT: .LBB9_31: # %cond.store29 ; SSE2-NEXT: pextrw $7, %xmm2, %eax ; SSE2-NEXT: movw %ax, 30(%rdi) -; SSE2-NEXT: .LBB9_32: # %else30 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i32_v16i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm9, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE4-NEXT: pxor %xmm9, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: packssdw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm7 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE4-NEXT: pxor %xmm1, %xmm6 +; SSE4-NEXT: packssdw %xmm7, %xmm6 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pxor %xmm1, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm1, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm6, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB9_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB9_3 +; SSE4-NEXT: .LBB9_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB9_5 +; SSE4-NEXT: .LBB9_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB9_7 +; SSE4-NEXT: .LBB9_8: # %else6 +; SSE4-NEXT: testb $16, %al +; 
SSE4-NEXT: jne .LBB9_9 +; SSE4-NEXT: .LBB9_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB9_11 +; SSE4-NEXT: .LBB9_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB9_13 +; SSE4-NEXT: .LBB9_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: je .LBB9_16 +; SSE4-NEXT: .LBB9_15: # %cond.store13 +; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi) +; SSE4-NEXT: .LBB9_16: # %else14 +; SSE4-NEXT: packssdw %xmm3, %xmm2 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB9_17 +; SSE4-NEXT: # %bb.18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB9_19 +; SSE4-NEXT: .LBB9_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB9_21 +; SSE4-NEXT: .LBB9_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB9_23 +; SSE4-NEXT: .LBB9_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB9_25 +; SSE4-NEXT: .LBB9_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB9_27 +; SSE4-NEXT: .LBB9_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB9_29 +; SSE4-NEXT: .LBB9_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB9_31 +; SSE4-NEXT: .LBB9_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB9_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB9_2: # %else -; SSE4-NEXT: pextrb $4, %xmm8, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB9_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB9_3: # %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB9_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB9_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB9_5: # %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB9_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB9_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB9_7: # %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB9_8: # %else6 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB9_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB9_9: # %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB9_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB9_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB9_11: # %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB9_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm5, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB9_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB9_13: # %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB9_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_16 -; SSE4-NEXT: # %bb.15: # 
%cond.store13 -; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB9_16: # %else14 -; SSE4-NEXT: packssdw %xmm3, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm6, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB9_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB9_15 +; SSE4-NEXT: jmp .LBB9_16 +; SSE4-NEXT: .LBB9_17: # %cond.store15 ; SSE4-NEXT: pextrw $0, %xmm2, 16(%rdi) -; SSE4-NEXT: .LBB9_18: # %else16 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB9_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB9_19: # %cond.store17 ; SSE4-NEXT: pextrw $1, %xmm2, 18(%rdi) -; SSE4-NEXT: .LBB9_20: # %else18 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm6, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB9_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB9_21: # %cond.store19 ; SSE4-NEXT: pextrw $2, %xmm2, 20(%rdi) -; SSE4-NEXT: .LBB9_22: # %else20 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB9_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB9_23: # %cond.store21 ; SSE4-NEXT: pextrw $3, %xmm2, 22(%rdi) -; SSE4-NEXT: .LBB9_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB9_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB9_25: # %cond.store23 ; SSE4-NEXT: pextrw $4, %xmm2, 24(%rdi) -; SSE4-NEXT: .LBB9_26: # %else24 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB9_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB9_27: # %cond.store25 ; SSE4-NEXT: pextrw $5, %xmm2, 26(%rdi) -; SSE4-NEXT: .LBB9_28: # %else26 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB9_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB9_29: # %cond.store27 ; SSE4-NEXT: pextrw $6, %xmm2, 28(%rdi) -; SSE4-NEXT: .LBB9_30: # %else28 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB9_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB9_31: # %cond.store29 ; SSE4-NEXT: pextrw $7, %xmm2, 30(%rdi) -; SSE4-NEXT: .LBB9_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; 
AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpackssdw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpextrb $0, %xmm6, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB9_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB9_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB9_3 ; AVX1-NEXT: .LBB9_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $2, %xmm5, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB9_5 ; AVX1-NEXT: .LBB9_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB9_7 ; AVX1-NEXT: .LBB9_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB9_9 ; AVX1-NEXT: .LBB9_10: # %else8 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB9_11 ; AVX1-NEXT: .LBB9_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw 
%xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB9_13 ; AVX1-NEXT: .LBB9_14: # %else12 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB9_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB9_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX1-NEXT: .LBB9_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: je .LBB9_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: jne .LBB9_17 +; AVX1-NEXT: # %bb.18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB9_19 +; AVX1-NEXT: .LBB9_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB9_21 +; AVX1-NEXT: .LBB9_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB9_23 +; AVX1-NEXT: .LBB9_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB9_25 +; AVX1-NEXT: .LBB9_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB9_27 +; AVX1-NEXT: .LBB9_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB9_29 +; AVX1-NEXT: .LBB9_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB9_31 +; AVX1-NEXT: .LBB9_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB9_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB9_4 +; AVX1-NEXT: .LBB9_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je .LBB9_6 +; AVX1-NEXT: .LBB9_5: # %cond.store3 +; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je .LBB9_8 +; AVX1-NEXT: .LBB9_7: # %cond.store5 +; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: je .LBB9_10 +; AVX1-NEXT: .LBB9_9: # %cond.store7 +; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: je .LBB9_12 +; AVX1-NEXT: .LBB9_11: # %cond.store9 +; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: je .LBB9_14 +; AVX1-NEXT: .LBB9_13: # %cond.store11 +; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB9_15 +; AVX1-NEXT: jmp .LBB9_16 +; AVX1-NEXT: .LBB9_17: # %cond.store15 ; AVX1-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX1-NEXT: .LBB9_18: # %else16 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB9_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB9_19: # %cond.store17 ; AVX1-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX1-NEXT: 
.LBB9_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB9_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB9_21: # %cond.store19 ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX1-NEXT: .LBB9_22: # %else20 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB9_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB9_23: # %cond.store21 ; AVX1-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX1-NEXT: .LBB9_24: # %else22 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB9_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB9_25: # %cond.store23 ; AVX1-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX1-NEXT: .LBB9_26: # %else24 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB9_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB9_27: # %cond.store25 ; AVX1-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX1-NEXT: .LBB9_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB9_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB9_29: # %cond.store27 ; AVX1-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX1-NEXT: .LBB9_30: # %else28 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB9_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB9_31: # %cond.store29 ; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX1-NEXT: .LBB9_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i32_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpextrb $0, %xmm6, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB9_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB9_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB9_3 +; AVX2-NEXT: .LBB9_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB9_5 +; AVX2-NEXT: .LBB9_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB9_7 +; AVX2-NEXT: .LBB9_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB9_9 +; AVX2-NEXT: .LBB9_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB9_11 +; AVX2-NEXT: .LBB9_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB9_13 +; AVX2-NEXT: .LBB9_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: je .LBB9_16 +; AVX2-NEXT: .LBB9_15: # %cond.store13 +; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX2-NEXT: .LBB9_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: jne .LBB9_17 +; AVX2-NEXT: # %bb.18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB9_19 +; AVX2-NEXT: .LBB9_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB9_21 +; AVX2-NEXT: .LBB9_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB9_23 +; AVX2-NEXT: .LBB9_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB9_25 +; AVX2-NEXT: .LBB9_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB9_27 +; AVX2-NEXT: .LBB9_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB9_29 +; AVX2-NEXT: .LBB9_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB9_31 +; AVX2-NEXT: .LBB9_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB9_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB9_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB9_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB9_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB9_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB9_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB9_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB9_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB9_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB9_7: # %cond.store5 ; 
AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB9_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $4, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB9_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB9_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB9_10: # %else8 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB9_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB9_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB9_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB9_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB9_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB9_14: # %else12 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB9_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB9_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: je .LBB9_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB9_15 +; AVX2-NEXT: jmp .LBB9_16 +; AVX2-NEXT: .LBB9_17: # %cond.store15 ; AVX2-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX2-NEXT: .LBB9_18: # %else16 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB9_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB9_19: # %cond.store17 ; AVX2-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX2-NEXT: .LBB9_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $10, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB9_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB9_21: # %cond.store19 ; AVX2-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX2-NEXT: .LBB9_22: # %else20 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; 
AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB9_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB9_23: # %cond.store21 ; AVX2-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX2-NEXT: .LBB9_24: # %else22 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB9_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB9_25: # %cond.store23 ; AVX2-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX2-NEXT: .LBB9_26: # %else24 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB9_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB9_27: # %cond.store25 ; AVX2-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX2-NEXT: .LBB9_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB9_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB9_29: # %cond.store27 ; AVX2-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX2-NEXT: .LBB9_30: # %else28 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB9_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB9_31: # %cond.store29 ; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX2-NEXT: .LBB9_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3615,116 +3440,117 @@ ; AVX512F-NEXT: vpmovsdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB9_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB9_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB9_3 +; AVX512F-NEXT: .LBB9_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB9_5 +; AVX512F-NEXT: .LBB9_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB9_7 +; AVX512F-NEXT: .LBB9_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB9_9 +; AVX512F-NEXT: .LBB9_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB9_11 +; AVX512F-NEXT: .LBB9_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB9_13 +; AVX512F-NEXT: .LBB9_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: je .LBB9_16 +; AVX512F-NEXT: .LBB9_15: # %cond.store13 +; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512F-NEXT: .LBB9_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: jne .LBB9_17 +; AVX512F-NEXT: # %bb.18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm 
= 0x200 +; AVX512F-NEXT: jne .LBB9_19 +; AVX512F-NEXT: .LBB9_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB9_21 +; AVX512F-NEXT: .LBB9_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB9_23 +; AVX512F-NEXT: .LBB9_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB9_25 +; AVX512F-NEXT: .LBB9_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB9_27 +; AVX512F-NEXT: .LBB9_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB9_29 +; AVX512F-NEXT: .LBB9_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB9_31 +; AVX512F-NEXT: .LBB9_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB9_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB9_2: # %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB9_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB9_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB9_4: # %else2 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB9_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB9_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB9_6: # %else4 -; AVX512F-NEXT: kshiftrw $3, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB9_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB9_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB9_8: # %else6 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB9_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB9_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB9_10: # %else8 -; AVX512F-NEXT: kshiftrw $5, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB9_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB9_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB9_12: # %else10 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB9_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB9_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB9_14: # %else12 -; AVX512F-NEXT: kshiftrw $7, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB9_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB9_16: # %else14 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: je .LBB9_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB9_15 +; AVX512F-NEXT: jmp .LBB9_16 +; AVX512F-NEXT: .LBB9_17: # %cond.store15 ; AVX512F-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; 
AVX512F-NEXT: .LBB9_18: # %else16 -; AVX512F-NEXT: kshiftrw $9, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB9_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB9_19: # %cond.store17 ; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX512F-NEXT: .LBB9_20: # %else18 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB9_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB9_21: # %cond.store19 ; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX512F-NEXT: .LBB9_22: # %else20 -; AVX512F-NEXT: kshiftrw $11, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB9_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB9_23: # %cond.store21 ; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX512F-NEXT: .LBB9_24: # %else22 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB9_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB9_25: # %cond.store23 ; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX512F-NEXT: .LBB9_26: # %else24 -; AVX512F-NEXT: kshiftrw $13, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB9_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB9_27: # %cond.store25 ; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX512F-NEXT: .LBB9_28: # %else26 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB9_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB9_29: # %cond.store27 ; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX512F-NEXT: .LBB9_30: # %else28 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB9_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB9_31: # %cond.store29 ; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX512F-NEXT: .LBB9_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3759,647 +3585,523 @@ ; SSE2-LABEL: truncstore_v16i32_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: packsswb %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB10_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB10_2: # %else +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: pextrw $2, %xmm8, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; 
SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm6, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB10_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB10_3 ; SSE2-NEXT: .LBB10_4: # %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB10_5 ; SSE2-NEXT: .LBB10_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm4, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB10_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB10_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB10_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB10_10: # %else8 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB10_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: movb %ah, 5(%rdi) +; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB10_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB10_14: # %else12 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pextrw $6, %xmm5, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB10_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: movb %ah, 7(%rdi) +; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB10_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: testl $256, %eax # imm = 0x100 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 -; SSE2-NEXT: movb %al, 8(%rdi) +; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB10_18: # %else16 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB10_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 -; SSE2-NEXT: movb %ah, 9(%rdi) +; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB10_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, 
%al -; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 -; SSE2-NEXT: movb %al, 10(%rdi) +; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB10_22: # %else20 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: pextrw $6, %xmm6, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB10_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 -; SSE2-NEXT: movb %ah, 11(%rdi) +; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB10_24: # %else22 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB10_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 -; SSE2-NEXT: movb %al, 12(%rdi) +; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB10_26: # %else24 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB10_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 -; SSE2-NEXT: movb %ah, 13(%rdi) +; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB10_28: # %else26 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: je .LBB10_30 -; SSE2-NEXT: # %bb.29: # %cond.store27 -; SSE2-NEXT: movb %al, 14(%rdi) -; SSE2-NEXT: .LBB10_30: # %else28 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pextrw $6, %xmm7, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 -; SSE2-NEXT: movb %ah, 15(%rdi) +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: jne .LBB10_29 +; SSE2-NEXT: # %bb.30: # %else28 +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB10_31 ; SSE2-NEXT: .LBB10_32: # %else30 ; SSE2-NEXT: retq +; SSE2-NEXT: .LBB10_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB10_4 +; SSE2-NEXT: .LBB10_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB10_6 +; SSE2-NEXT: .LBB10_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB10_7 +; SSE2-NEXT: jmp .LBB10_8 +; SSE2-NEXT: .LBB10_29: # %cond.store27 +; SSE2-NEXT: movb %cl, 14(%rdi) +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: je .LBB10_32 +; SSE2-NEXT: .LBB10_31: # %cond.store29 +; SSE2-NEXT: movb %ch, 15(%rdi) +; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i32_v16i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm9, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE4-NEXT: pxor %xmm9, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: packssdw %xmm3, %xmm2 ; SSE4-NEXT: packssdw %xmm1, %xmm0 ; SSE4-NEXT: packsswb %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_2 -; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB10_2: # %else -; SSE4-NEXT: pextrb $4, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; 
SSE4-NEXT: je .LBB10_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB10_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: pxor %xmm1, %xmm7 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE4-NEXT: pxor %xmm1, %xmm6 +; SSE4-NEXT: packssdw %xmm7, %xmm6 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pxor %xmm1, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm1, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm6, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB10_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB10_3 +; SSE4-NEXT: .LBB10_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB10_5 +; SSE4-NEXT: .LBB10_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB10_7 +; SSE4-NEXT: .LBB10_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB10_9 +; SSE4-NEXT: .LBB10_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB10_11 +; SSE4-NEXT: .LBB10_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB10_13 +; SSE4-NEXT: .LBB10_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB10_15 +; SSE4-NEXT: .LBB10_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB10_17 +; SSE4-NEXT: .LBB10_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB10_19 +; SSE4-NEXT: .LBB10_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB10_21 +; SSE4-NEXT: .LBB10_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB10_23 +; SSE4-NEXT: .LBB10_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB10_25 +; SSE4-NEXT: .LBB10_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB10_27 +; SSE4-NEXT: .LBB10_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB10_29 +; SSE4-NEXT: .LBB10_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB10_31 +; SSE4-NEXT: .LBB10_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB10_1: # %cond.store +; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB10_4 +; SSE4-NEXT: .LBB10_3: # %cond.store1 +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB10_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB10_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB10_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB10_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB10_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB10_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB10_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB10_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB10_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je 
.LBB10_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB10_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB10_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm5, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB10_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB10_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB10_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB10_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB10_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB10_16: # %else14 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB10_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB10_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB10_18: # %else16 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB10_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB10_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB10_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm6 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm6, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB10_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB10_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB10_22: # %else20 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB10_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB10_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB10_24: # %else22 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB10_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB10_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB10_26: # %else24 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB10_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB10_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB10_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB10_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB10_29: # %cond.store27 ; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB10_30: # %else28 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 
0x8000 ; SSE4-NEXT: je .LBB10_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB10_31: # %cond.store29 ; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB10_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpackssdw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm6, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB10_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB10_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB10_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB10_3 +; AVX1-NEXT: .LBB10_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB10_5 +; AVX1-NEXT: .LBB10_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB10_7 +; AVX1-NEXT: .LBB10_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB10_9 +; AVX1-NEXT: .LBB10_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB10_11 +; AVX1-NEXT: .LBB10_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB10_13 +; AVX1-NEXT: .LBB10_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB10_15 +; AVX1-NEXT: .LBB10_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB10_17 +; AVX1-NEXT: .LBB10_18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB10_19 +; AVX1-NEXT: .LBB10_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB10_21 +; AVX1-NEXT: .LBB10_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB10_23 +; AVX1-NEXT: .LBB10_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB10_25 +; AVX1-NEXT: .LBB10_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB10_27 +; AVX1-NEXT: .LBB10_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB10_29 +; AVX1-NEXT: .LBB10_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB10_31 +; AVX1-NEXT: .LBB10_32: # %else30 +; AVX1-NEXT: 
vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB10_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB10_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB10_3: # %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB10_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $2, %xmm5, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB10_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB10_5: # %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB10_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB10_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB10_7: # %cond.store5 ; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB10_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB10_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB10_9: # %cond.store7 ; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB10_10: # %else8 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB10_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB10_11: # %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB10_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB10_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB10_13: # %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB10_14: # %else12 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB10_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB10_15: # %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB10_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB10_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: 
.LBB10_17: # %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB10_18: # %else16 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB10_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB10_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB10_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB10_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB10_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB10_22: # %else20 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB10_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB10_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB10_24: # %else22 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB10_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB10_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB10_26: # %else24 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB10_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB10_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB10_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB10_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB10_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB10_30: # %else28 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB10_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB10_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB10_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; 
AVX2-LABEL: truncstore_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm6, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB10_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB10_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB10_3 +; AVX2-NEXT: .LBB10_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB10_5 +; AVX2-NEXT: .LBB10_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB10_7 +; AVX2-NEXT: .LBB10_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB10_9 +; AVX2-NEXT: .LBB10_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB10_11 +; AVX2-NEXT: .LBB10_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB10_13 +; AVX2-NEXT: .LBB10_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB10_15 +; AVX2-NEXT: .LBB10_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB10_17 +; AVX2-NEXT: .LBB10_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB10_19 +; AVX2-NEXT: .LBB10_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB10_21 +; AVX2-NEXT: .LBB10_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB10_23 +; AVX2-NEXT: .LBB10_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB10_25 +; AVX2-NEXT: .LBB10_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB10_27 +; AVX2-NEXT: .LBB10_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB10_29 +; AVX2-NEXT: .LBB10_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB10_31 +; AVX2-NEXT: .LBB10_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB10_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB10_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB10_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB10_3: # %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB10_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: testb $1, %al +; 
AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB10_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB10_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB10_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB10_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB10_7: # %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB10_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $4, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB10_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB10_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB10_10: # %else8 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB10_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB10_11: # %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB10_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB10_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB10_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB10_14: # %else12 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB10_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB10_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB10_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB10_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB10_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB10_18: # %else16 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB10_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB10_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX2-NEXT: .LBB10_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, 
%xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $10, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB10_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB10_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB10_22: # %else20 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB10_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB10_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB10_24: # %else22 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB10_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB10_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB10_26: # %else24 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB10_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB10_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB10_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB10_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB10_29: # %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB10_30: # %else28 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB10_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB10_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB10_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4409,115 +4111,117 @@ ; AVX512F-NEXT: vpmovsdb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB10_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB10_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB10_3 +; AVX512F-NEXT: .LBB10_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB10_5 +; AVX512F-NEXT: .LBB10_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB10_7 +; AVX512F-NEXT: .LBB10_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB10_9 +; AVX512F-NEXT: .LBB10_10: 
# %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB10_11 +; AVX512F-NEXT: .LBB10_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB10_13 +; AVX512F-NEXT: .LBB10_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB10_15 +; AVX512F-NEXT: .LBB10_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB10_17 +; AVX512F-NEXT: .LBB10_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB10_19 +; AVX512F-NEXT: .LBB10_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB10_21 +; AVX512F-NEXT: .LBB10_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB10_23 +; AVX512F-NEXT: .LBB10_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB10_25 +; AVX512F-NEXT: .LBB10_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB10_27 +; AVX512F-NEXT: .LBB10_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB10_29 +; AVX512F-NEXT: .LBB10_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB10_31 +; AVX512F-NEXT: .LBB10_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB10_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB10_2: # %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB10_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB10_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB10_4: # %else2 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB10_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB10_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB10_6: # %else4 -; AVX512F-NEXT: kshiftrw $3, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB10_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB10_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB10_8: # %else6 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB10_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB10_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB10_10: # %else8 -; AVX512F-NEXT: kshiftrw $5, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB10_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB10_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB10_12: # %else10 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB10_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB10_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB10_14: # %else12 -; AVX512F-NEXT: kshiftrw $7, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je 
.LBB10_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB10_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB10_16: # %else14 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB10_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB10_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB10_18: # %else16 -; AVX512F-NEXT: kshiftrw $9, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB10_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB10_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB10_20: # %else18 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB10_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB10_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB10_22: # %else20 -; AVX512F-NEXT: kshiftrw $11, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB10_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB10_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB10_24: # %else22 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB10_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB10_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB10_26: # %else24 -; AVX512F-NEXT: kshiftrw $13, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB10_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB10_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB10_28: # %else26 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB10_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB10_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB10_30: # %else28 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB10_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB10_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB10_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4551,303 +4255,284 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: packssdw %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je 
.LBB11_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB11_2: # %else -; SSE2-NEXT: psrlq $16, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: .LBB11_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm0, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: jne .LBB11_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB11_3 +; SSE2-NEXT: .LBB11_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB11_5 ; SSE2-NEXT: .LBB11_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB11_7 ; SSE2-NEXT: .LBB11_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB11_9 ; SSE2-NEXT: .LBB11_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB11_11 ; SSE2-NEXT: .LBB11_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB11_13 ; SSE2-NEXT: .LBB11_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB11_15 +; SSE2-NEXT: .LBB11_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB11_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB11_4 +; SSE2-NEXT: .LBB11_3: # %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB11_6 +; SSE2-NEXT: .LBB11_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB11_8 +; SSE2-NEXT: .LBB11_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB11_10 +; SSE2-NEXT: .LBB11_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; 
SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB11_12 +; SSE2-NEXT: .LBB11_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB11_14 +; SSE2-NEXT: .LBB11_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB11_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB11_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) -; SSE2-NEXT: .LBB11_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i32_v8i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm5, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: packssdw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm4, %eax +; SSE4-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE4-NEXT: pxor %xmm1, %xmm2 +; SSE4-NEXT: packssdw %xmm3, %xmm2 +; SSE4-NEXT: packsswb %xmm0, %xmm2 +; SSE4-NEXT: pmovmskb %xmm2, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB11_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB11_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB11_3 +; SSE4-NEXT: .LBB11_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB11_5 +; SSE4-NEXT: .LBB11_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB11_7 +; SSE4-NEXT: .LBB11_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB11_9 +; SSE4-NEXT: .LBB11_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB11_11 +; SSE4-NEXT: .LBB11_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB11_13 +; SSE4-NEXT: .LBB11_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB11_15 +; SSE4-NEXT: .LBB11_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB11_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB11_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB11_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB11_3: # %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB11_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB11_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB11_5: # %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB11_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB11_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB11_7: # %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB11_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB11_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB11_9: # %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB11_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je 
.LBB11_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB11_11: # %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB11_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm3, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB11_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB11_13: # %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB11_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB11_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB11_15: # %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB11_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i32_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB11_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB11_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB11_3 +; AVX1-NEXT: .LBB11_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB11_5 +; AVX1-NEXT: .LBB11_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB11_7 +; AVX1-NEXT: .LBB11_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB11_9 +; AVX1-NEXT: .LBB11_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB11_11 +; AVX1-NEXT: .LBB11_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB11_13 +; AVX1-NEXT: .LBB11_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB11_15 +; AVX1-NEXT: .LBB11_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB11_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB11_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB11_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB11_3: # %cond.store1 ; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB11_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB11_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB11_5: # %cond.store3 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB11_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB11_8 -; AVX1-NEXT: 
# %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB11_7: # %cond.store5 ; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB11_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB11_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB11_9: # %cond.store7 ; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB11_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB11_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB11_11: # %cond.store9 ; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB11_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB11_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB11_13: # %cond.store11 ; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB11_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB11_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB11_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB11_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i32_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB11_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB11_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB11_3 +; AVX2-NEXT: .LBB11_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB11_5 +; AVX2-NEXT: .LBB11_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB11_7 +; AVX2-NEXT: .LBB11_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB11_9 +; AVX2-NEXT: .LBB11_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB11_11 +; AVX2-NEXT: .LBB11_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB11_13 +; AVX2-NEXT: .LBB11_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB11_15 +; AVX2-NEXT: .LBB11_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB11_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB11_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB11_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB11_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB11_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB11_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB11_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB11_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB11_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB11_7: # %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB11_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB11_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB11_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB11_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB11_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB11_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB11_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB11_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB11_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB11_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB11_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB11_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB11_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4855,73 +4540,68 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512F-NEXT: vpminsd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512F-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB11_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB11_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB11_3 +; AVX512F-NEXT: .LBB11_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB11_5 +; AVX512F-NEXT: .LBB11_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB11_7 +; 
AVX512F-NEXT: .LBB11_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB11_9 +; AVX512F-NEXT: .LBB11_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB11_11 +; AVX512F-NEXT: .LBB11_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB11_13 +; AVX512F-NEXT: .LBB11_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB11_15 +; AVX512F-NEXT: .LBB11_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB11_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB11_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB11_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB11_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB11_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB11_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB11_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB11_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB11_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB11_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB11_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB11_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB11_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB11_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB11_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB11_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB11_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB11_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB11_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB11_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB11_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB11_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB11_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4961,341 +4641,322 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, 
%xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: packssdw %xmm0, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127,127,127] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pandn %xmm6, %xmm7 -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pand %xmm0, %xmm6 ; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: packssdw %xmm6, %xmm0 -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB12_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB12_2: # %else -; SSE2-NEXT: psrlq $16, %xmm4 -; SSE2-NEXT: movd %xmm4, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB12_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB12_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm0, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB12_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB12_3 +; SSE2-NEXT: .LBB12_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB12_5 ; SSE2-NEXT: .LBB12_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB12_7 ; SSE2-NEXT: .LBB12_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB12_9 ; SSE2-NEXT: .LBB12_10: # 
%else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB12_11 ; SSE2-NEXT: .LBB12_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB12_13 ; SSE2-NEXT: .LBB12_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB12_15 ; SSE2-NEXT: .LBB12_16: # %else14 ; SSE2-NEXT: retq -; -; SSE4-LABEL: truncstore_v8i32_v8i8: -; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm5, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127] -; SSE4-NEXT: pminsd %xmm5, %xmm0 -; SSE4-NEXT: pminsd %xmm5, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168] -; SSE4-NEXT: pmaxsd %xmm5, %xmm1 -; SSE4-NEXT: pmaxsd %xmm5, %xmm0 -; SSE4-NEXT: packssdw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB12_2 -; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB12_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB12_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB12_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: .LBB12_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB12_4 +; SSE2-NEXT: .LBB12_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB12_6 +; SSE2-NEXT: .LBB12_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB12_8 +; SSE2-NEXT: .LBB12_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB12_10 +; SSE2-NEXT: .LBB12_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB12_12 +; SSE2-NEXT: .LBB12_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB12_14 +; SSE2-NEXT: .LBB12_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: je .LBB12_16 +; SSE2-NEXT: .LBB12_15: # %cond.store13 +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: movb %al, 7(%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: truncstore_v8i32_v8i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pxor %xmm4, %xmm4 +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127] +; SSE4-NEXT: pminsd %xmm5, %xmm0 +; SSE4-NEXT: pminsd %xmm5, %xmm1 +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168] +; SSE4-NEXT: pmaxsd 
%xmm5, %xmm1 +; SSE4-NEXT: pmaxsd %xmm5, %xmm0 +; SSE4-NEXT: packssdw %xmm1, %xmm0 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax +; SSE4-NEXT: pxor %xmm1, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE4-NEXT: pxor %xmm1, %xmm2 +; SSE4-NEXT: packssdw %xmm3, %xmm2 +; SSE4-NEXT: packsswb %xmm0, %xmm2 +; SSE4-NEXT: pmovmskb %xmm2, %eax ; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB12_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB12_3 +; SSE4-NEXT: .LBB12_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB12_5 +; SSE4-NEXT: .LBB12_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB12_7 +; SSE4-NEXT: .LBB12_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB12_9 +; SSE4-NEXT: .LBB12_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB12_11 +; SSE4-NEXT: .LBB12_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB12_13 +; SSE4-NEXT: .LBB12_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB12_15 +; SSE4-NEXT: .LBB12_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB12_1: # %cond.store +; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB12_4 +; SSE4-NEXT: .LBB12_3: # %cond.store1 +; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB12_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB12_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB12_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB12_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB12_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB12_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB12_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB12_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB12_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB12_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB12_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB12_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm3, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB12_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB12_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB12_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB12_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB12_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB12_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127] -; AVX1-NEXT: vpminsd %xmm5, %xmm0, %xmm6 +; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127] +; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminsd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168] -; AVX1-NEXT: vpmaxsd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB12_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168] +; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB12_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB12_3 +; AVX1-NEXT: .LBB12_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB12_5 +; AVX1-NEXT: .LBB12_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB12_7 +; AVX1-NEXT: .LBB12_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB12_9 +; AVX1-NEXT: .LBB12_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB12_11 +; AVX1-NEXT: .LBB12_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB12_13 +; AVX1-NEXT: .LBB12_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB12_15 +; AVX1-NEXT: .LBB12_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB12_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB12_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB12_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB12_3: # %cond.store1 ; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB12_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB12_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB12_5: # %cond.store3 ; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB12_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB12_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB12_7: # %cond.store5 ; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB12_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB12_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB12_9: # %cond.store7 ; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB12_10: # %else8 -; 
AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB12_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB12_11: # %cond.store9 ; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB12_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB12_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB12_13: # %cond.store11 ; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB12_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB12_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB12_15: # %cond.store13 ; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB12_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i32_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127] -; AVX2-NEXT: vpminsd %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX2-NEXT: vpmaxsd %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB12_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpminsd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX2-NEXT: vpmaxsd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB12_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB12_3 +; AVX2-NEXT: .LBB12_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB12_5 +; AVX2-NEXT: .LBB12_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB12_7 +; AVX2-NEXT: .LBB12_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB12_9 +; AVX2-NEXT: .LBB12_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB12_11 +; AVX2-NEXT: .LBB12_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB12_13 +; AVX2-NEXT: .LBB12_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB12_15 +; AVX2-NEXT: .LBB12_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB12_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB12_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB12_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB12_3: # %cond.store1 ; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB12_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, 
%xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB12_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB12_5: # %cond.store3 ; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB12_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB12_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB12_7: # %cond.store5 ; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB12_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB12_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB12_9: # %cond.store7 ; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB12_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB12_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB12_11: # %cond.store9 ; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB12_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB12_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB12_13: # %cond.store11 ; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB12_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB12_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB12_15: # %cond.store13 ; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB12_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5303,73 +4964,68 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512F-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB12_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB12_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB12_3 +; AVX512F-NEXT: .LBB12_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB12_5 +; AVX512F-NEXT: .LBB12_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB12_7 +; AVX512F-NEXT: .LBB12_8: # %else6 +; AVX512F-NEXT: testb $16, 
%al +; AVX512F-NEXT: jne .LBB12_9 +; AVX512F-NEXT: .LBB12_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB12_11 +; AVX512F-NEXT: .LBB12_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB12_13 +; AVX512F-NEXT: .LBB12_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB12_15 +; AVX512F-NEXT: .LBB12_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB12_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB12_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB12_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB12_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB12_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB12_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB12_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB12_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB12_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB12_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB12_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB12_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB12_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB12_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB12_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB12_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB12_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB12_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB12_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB12_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB12_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB12_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB12_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5411,7 +5067,6 @@ ; SSE2-LABEL: truncstore_v4i32_v4i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767] ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 @@ -5424,198 +5079,192 @@ ; SSE2-NEXT: pand %xmm0, %xmm4 ; SSE2-NEXT: pandn %xmm3, 
%xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB13_2: # %else -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) +; SSE2-NEXT: jne .LBB13_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB13_3 ; SSE2-NEXT: .LBB13_4: # %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB13_5 ; SSE2-NEXT: .LBB13_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB13_7 +; SSE2-NEXT: .LBB13_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB13_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB13_4 +; SSE2-NEXT: .LBB13_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB13_6 +; SSE2-NEXT: .LBB13_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB13_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB13_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) -; SSE2-NEXT: .LBB13_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i32_v4i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pminsd {{.*}}(%rip), %xmm0 ; SSE4-NEXT: pmaxsd {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: movmskps %xmm2, %eax +; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB13_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB13_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB13_3 +; SSE4-NEXT: .LBB13_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB13_5 +; SSE4-NEXT: .LBB13_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB13_7 +; SSE4-NEXT: .LBB13_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB13_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB13_2: # %else -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB13_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB13_3: # %cond.store1 ; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB13_4: # %else2 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; 
SSE4-NEXT: je .LBB13_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB13_5: # %cond.store3 ; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB13_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB13_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB13_7: # %cond.store5 ; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB13_8: # %else6 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i32_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB13_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB13_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB13_3 +; AVX1-NEXT: .LBB13_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB13_5 +; AVX1-NEXT: .LBB13_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB13_7 +; AVX1-NEXT: .LBB13_8: # %else6 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB13_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB13_2: # %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB13_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB13_3: # %cond.store1 ; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB13_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB13_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB13_5: # %cond.store3 ; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB13_6: # %else4 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB13_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB13_7: # %cond.store5 ; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB13_8: # %else6 ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v4i32_v4i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32767,32767,32767,32767] ; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294934528,4294934528,4294934528,4294934528] ; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB13_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB13_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB13_3 +; AVX2-NEXT: .LBB13_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB13_5 +; AVX2-NEXT: .LBB13_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB13_7 +; AVX2-NEXT: .LBB13_8: # %else6 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB13_1: # %cond.store ; AVX2-NEXT: vpextrw $0, 
%xmm0, (%rdi) -; AVX2-NEXT: .LBB13_2: # %else -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB13_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB13_3: # %cond.store1 ; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB13_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB13_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB13_5: # %cond.store3 ; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB13_6: # %else4 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB13_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB13_7: # %cond.store5 ; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB13_8: # %else6 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [32767,32767,32767,32767] -; AVX512F-NEXT: vpminsd %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294934528,4294934528,4294934528,4294934528] -; AVX512F-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB13_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB13_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB13_3 +; AVX512F-NEXT: .LBB13_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB13_5 +; AVX512F-NEXT: .LBB13_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB13_7 +; AVX512F-NEXT: .LBB13_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB13_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB13_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB13_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB13_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB13_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB13_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB13_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB13_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB13_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB13_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB13_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5655,7 +5304,6 @@ ; SSE2-LABEL: 
truncstore_v4i32_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [127,127,127,127] ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 @@ -5668,198 +5316,192 @@ ; SSE2-NEXT: pand %xmm0, %xmm4 ; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB14_2: # %else -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: jne .LBB14_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB14_3 ; SSE2-NEXT: .LBB14_4: # %else2 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pextrw $4, %xmm3, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB14_5 ; SSE2-NEXT: .LBB14_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB14_7 +; SSE2-NEXT: .LBB14_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB14_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB14_4 +; SSE2-NEXT: .LBB14_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB14_6 +; SSE2-NEXT: .LBB14_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB14_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB14_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: movb %al, 3(%rdi) -; SSE2-NEXT: .LBB14_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i32_v4i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pminsd {{.*}}(%rip), %xmm0 ; SSE4-NEXT: pmaxsd {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: movmskps %xmm2, %eax +; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB14_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB14_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB14_3 +; SSE4-NEXT: .LBB14_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB14_5 +; SSE4-NEXT: .LBB14_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB14_7 +; SSE4-NEXT: .LBB14_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB14_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB14_2: # %else -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB14_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB14_3: # 
%cond.store1 ; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB14_4: # %else2 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB14_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB14_5: # %cond.store3 ; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB14_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB14_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB14_7: # %cond.store5 ; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB14_8: # %else6 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i32_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB14_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB14_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB14_3 +; AVX1-NEXT: .LBB14_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB14_5 +; AVX1-NEXT: .LBB14_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB14_7 +; AVX1-NEXT: .LBB14_8: # %else6 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB14_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB14_2: # %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB14_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB14_3: # %cond.store1 ; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB14_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB14_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB14_5: # %cond.store3 ; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB14_6: # %else4 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB14_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB14_7: # %cond.store5 ; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB14_8: # %else6 ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v4i32_v4i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [127,127,127,127] ; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168] ; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB14_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB14_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; 
AVX2-NEXT: jne .LBB14_3 +; AVX2-NEXT: .LBB14_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB14_5 +; AVX2-NEXT: .LBB14_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB14_7 +; AVX2-NEXT: .LBB14_8: # %else6 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB14_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB14_2: # %else -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB14_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB14_3: # %cond.store1 ; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB14_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB14_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB14_5: # %cond.store3 ; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB14_6: # %else4 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB14_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB14_7: # %cond.store5 ; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB14_8: # %else6 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [127,127,127,127] -; AVX512F-NEXT: vpminsd %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168] -; AVX512F-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB14_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB14_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB14_3 +; AVX512F-NEXT: .LBB14_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB14_5 +; AVX512F-NEXT: .LBB14_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB14_7 +; AVX512F-NEXT: .LBB14_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB14_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB14_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB14_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB14_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB14_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB14_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB14_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB14_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; 
AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB14_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB14_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB14_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5899,1503 +5541,1172 @@ ; SSE2-LABEL: truncstore_v32i16_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm6 ; SSE2-NEXT: packsswb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm6, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al +; SSE2-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %ecx +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm6, %xmm5 +; SSE2-NEXT: pmovmskb %xmm5, %eax +; SSE2-NEXT: notl %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: orl %ecx, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB15_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB15_2: # %else -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB15_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB15_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB15_3 ; SSE2-NEXT: .LBB15_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB15_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB15_5 ; SSE2-NEXT: .LBB15_6: # %else4 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB15_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB15_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB15_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB15_10: # %else8 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB15_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB15_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB15_14: # %else12 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB15_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB15_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; 
SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB15_18: # %else16 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB15_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB15_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB15_22: # %else20 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB15_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB15_24: # %else22 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB15_26: # %else24 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB15_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB15_28: # %else26 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE2-NEXT: pextrw $7, %xmm4, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_30 ; SSE2-NEXT: # %bb.29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) ; SSE2-NEXT: .LBB15_30: # %else28 ; SSE2-NEXT: packsswb %xmm3, %xmm2 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB15_32 ; SSE2-NEXT: # %bb.31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) ; SSE2-NEXT: .LBB15_32: # %else30 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: je .LBB15_34 -; SSE2-NEXT: # %bb.33: # %cond.store31 -; SSE2-NEXT: movb %al, 16(%rdi) -; SSE2-NEXT: .LBB15_34: # %else32 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB15_36 -; SSE2-NEXT: # %bb.35: # %cond.store33 -; SSE2-NEXT: movb %ah, 17(%rdi) +; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: jne .LBB15_33 +; SSE2-NEXT: # %bb.34: # %else32 +; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE2-NEXT: jne .LBB15_35 ; SSE2-NEXT: .LBB15_36: # %else34 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB15_38 -; SSE2-NEXT: # %bb.37: # %cond.store35 -; 
SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 18(%rdi) +; SSE2-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE2-NEXT: jne .LBB15_37 ; SSE2-NEXT: .LBB15_38: # %else36 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $524288, %eax # imm = 0x80000 ; SSE2-NEXT: je .LBB15_40 -; SSE2-NEXT: # %bb.39: # %cond.store37 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 19(%rdi) +; SSE2-NEXT: .LBB15_39: # %cond.store37 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 19(%rdi) ; SSE2-NEXT: .LBB15_40: # %else38 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: pextrw $2, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_42 ; SSE2-NEXT: # %bb.41: # %cond.store39 ; SSE2-NEXT: movb %cl, 20(%rdi) ; SSE2-NEXT: .LBB15_42: # %else40 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2097152, %eax # imm = 0x200000 ; SSE2-NEXT: je .LBB15_44 ; SSE2-NEXT: # %bb.43: # %cond.store41 ; SSE2-NEXT: movb %ch, 21(%rdi) ; SSE2-NEXT: .LBB15_44: # %else42 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: pextrw $3, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_46 ; SSE2-NEXT: # %bb.45: # %cond.store43 ; SSE2-NEXT: movb %cl, 22(%rdi) ; SSE2-NEXT: .LBB15_46: # %else44 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8388608, %eax # imm = 0x800000 ; SSE2-NEXT: je .LBB15_48 ; SSE2-NEXT: # %bb.47: # %cond.store45 ; SSE2-NEXT: movb %ch, 23(%rdi) ; SSE2-NEXT: .LBB15_48: # %else46 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: pextrw $4, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_50 ; SSE2-NEXT: # %bb.49: # %cond.store47 ; SSE2-NEXT: movb %cl, 24(%rdi) ; SSE2-NEXT: .LBB15_50: # %else48 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $33554432, %eax # imm = 0x2000000 ; SSE2-NEXT: je .LBB15_52 ; SSE2-NEXT: # %bb.51: # %cond.store49 ; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: .LBB15_52: # %else50 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: pextrw $5, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_54 ; SSE2-NEXT: # %bb.53: # %cond.store51 ; SSE2-NEXT: movb %cl, 26(%rdi) ; SSE2-NEXT: .LBB15_54: # %else52 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $134217728, %eax # imm = 0x8000000 ; SSE2-NEXT: je .LBB15_56 ; SSE2-NEXT: # %bb.55: # %cond.store53 ; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: .LBB15_56: # %else54 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: pextrw $6, %xmm2, %ecx ; 
SSE2-NEXT: je .LBB15_58 ; SSE2-NEXT: # %bb.57: # %cond.store55 ; SSE2-NEXT: movb %cl, 28(%rdi) ; SSE2-NEXT: .LBB15_58: # %else56 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $536870912, %eax # imm = 0x20000000 ; SSE2-NEXT: je .LBB15_60 ; SSE2-NEXT: # %bb.59: # %cond.store57 ; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: .LBB15_60: # %else58 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm5 -; SSE2-NEXT: pextrw $7, %xmm5, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: pextrw $7, %xmm2, %ecx -; SSE2-NEXT: je .LBB15_62 -; SSE2-NEXT: # %bb.61: # %cond.store59 +; SSE2-NEXT: jne .LBB15_61 +; SSE2-NEXT: # %bb.62: # %else60 +; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; SSE2-NEXT: jne .LBB15_63 +; SSE2-NEXT: .LBB15_64: # %else62 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB15_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB15_4 +; SSE2-NEXT: .LBB15_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB15_6 +; SSE2-NEXT: .LBB15_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB15_7 +; SSE2-NEXT: jmp .LBB15_8 +; SSE2-NEXT: .LBB15_33: # %cond.store31 +; SSE2-NEXT: movb %cl, 16(%rdi) +; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE2-NEXT: je .LBB15_36 +; SSE2-NEXT: .LBB15_35: # %cond.store33 +; SSE2-NEXT: movb %ch, 17(%rdi) +; SSE2-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE2-NEXT: je .LBB15_38 +; SSE2-NEXT: .LBB15_37: # %cond.store35 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 18(%rdi) +; SSE2-NEXT: testl $524288, %eax # imm = 0x80000 +; SSE2-NEXT: jne .LBB15_39 +; SSE2-NEXT: jmp .LBB15_40 +; SSE2-NEXT: .LBB15_61: # %cond.store59 ; SSE2-NEXT: movb %cl, 30(%rdi) -; SSE2-NEXT: .LBB15_62: # %else60 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; SSE2-NEXT: je .LBB15_64 -; SSE2-NEXT: # %bb.63: # %cond.store61 +; SSE2-NEXT: .LBB15_63: # %cond.store61 ; SSE2-NEXT: movb %ch, 31(%rdi) -; SSE2-NEXT: .LBB15_64: # %else62 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v32i16_v32i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm6, %xmm6 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm6 ; SSE4-NEXT: packsswb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm6, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqb %xmm6, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %ecx +; SSE4-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; SSE4-NEXT: pcmpeqb %xmm6, %xmm5 +; SSE4-NEXT: pmovmskb %xmm5, %eax +; SSE4-NEXT: notl %eax +; SSE4-NEXT: shll $16, %eax +; SSE4-NEXT: orl %ecx, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB15_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB15_3 +; SSE4-NEXT: .LBB15_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB15_5 +; SSE4-NEXT: .LBB15_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB15_7 +; SSE4-NEXT: .LBB15_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB15_9 +; SSE4-NEXT: .LBB15_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB15_11 +; SSE4-NEXT: .LBB15_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB15_13 +; 
SSE4-NEXT: .LBB15_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB15_15 +; SSE4-NEXT: .LBB15_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB15_17 +; SSE4-NEXT: .LBB15_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB15_19 +; SSE4-NEXT: .LBB15_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB15_21 +; SSE4-NEXT: .LBB15_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB15_23 +; SSE4-NEXT: .LBB15_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB15_25 +; SSE4-NEXT: .LBB15_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB15_27 +; SSE4-NEXT: .LBB15_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB15_29 +; SSE4-NEXT: .LBB15_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: je .LBB15_32 +; SSE4-NEXT: .LBB15_31: # %cond.store29 +; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) +; SSE4-NEXT: .LBB15_32: # %else30 +; SSE4-NEXT: packsswb %xmm3, %xmm2 +; SSE4-NEXT: testl $65536, %eax # imm = 0x10000 +; SSE4-NEXT: jne .LBB15_33 +; SSE4-NEXT: # %bb.34: # %else32 +; SSE4-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE4-NEXT: jne .LBB15_35 +; SSE4-NEXT: .LBB15_36: # %else34 +; SSE4-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE4-NEXT: jne .LBB15_37 +; SSE4-NEXT: .LBB15_38: # %else36 +; SSE4-NEXT: testl $524288, %eax # imm = 0x80000 +; SSE4-NEXT: jne .LBB15_39 +; SSE4-NEXT: .LBB15_40: # %else38 +; SSE4-NEXT: testl $1048576, %eax # imm = 0x100000 +; SSE4-NEXT: jne .LBB15_41 +; SSE4-NEXT: .LBB15_42: # %else40 +; SSE4-NEXT: testl $2097152, %eax # imm = 0x200000 +; SSE4-NEXT: jne .LBB15_43 +; SSE4-NEXT: .LBB15_44: # %else42 +; SSE4-NEXT: testl $4194304, %eax # imm = 0x400000 +; SSE4-NEXT: jne .LBB15_45 +; SSE4-NEXT: .LBB15_46: # %else44 +; SSE4-NEXT: testl $8388608, %eax # imm = 0x800000 +; SSE4-NEXT: jne .LBB15_47 +; SSE4-NEXT: .LBB15_48: # %else46 +; SSE4-NEXT: testl $16777216, %eax # imm = 0x1000000 +; SSE4-NEXT: jne .LBB15_49 +; SSE4-NEXT: .LBB15_50: # %else48 +; SSE4-NEXT: testl $33554432, %eax # imm = 0x2000000 +; SSE4-NEXT: jne .LBB15_51 +; SSE4-NEXT: .LBB15_52: # %else50 +; SSE4-NEXT: testl $67108864, %eax # imm = 0x4000000 +; SSE4-NEXT: jne .LBB15_53 +; SSE4-NEXT: .LBB15_54: # %else52 +; SSE4-NEXT: testl $134217728, %eax # imm = 0x8000000 +; SSE4-NEXT: jne .LBB15_55 +; SSE4-NEXT: .LBB15_56: # %else54 +; SSE4-NEXT: testl $268435456, %eax # imm = 0x10000000 +; SSE4-NEXT: jne .LBB15_57 +; SSE4-NEXT: .LBB15_58: # %else56 +; SSE4-NEXT: testl $536870912, %eax # imm = 0x20000000 +; SSE4-NEXT: jne .LBB15_59 +; SSE4-NEXT: .LBB15_60: # %else58 +; SSE4-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; SSE4-NEXT: jne .LBB15_61 +; SSE4-NEXT: .LBB15_62: # %else60 +; SSE4-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; SSE4-NEXT: jne .LBB15_63 +; SSE4-NEXT: .LBB15_64: # %else62 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB15_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB15_2: # %else -; SSE4-NEXT: pextrb $1, %xmm6, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB15_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB15_3: # %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB15_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $2, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; 
SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB15_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB15_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB15_6: # %else4 -; SSE4-NEXT: pextrb $3, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB15_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB15_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB15_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB15_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB15_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB15_10: # %else8 -; SSE4-NEXT: pextrb $5, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB15_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB15_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB15_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $6, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB15_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB15_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB15_14: # %else12 -; SSE4-NEXT: pextrb $7, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB15_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB15_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB15_16: # %else14 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB15_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB15_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB15_18: # %else16 -; SSE4-NEXT: pextrb $9, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB15_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB15_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB15_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $10, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB15_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB15_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB15_22: # %else20 -; SSE4-NEXT: pextrb $11, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB15_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB15_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB15_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB15_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB15_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 
-; SSE4-NEXT: .LBB15_26: # %else24
-; SSE4-NEXT: pextrb $13, %xmm1, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $8192, %eax # imm = 0x2000
; SSE4-NEXT: je .LBB15_28
-; SSE4-NEXT: # %bb.27: # %cond.store25
+; SSE4-NEXT: .LBB15_27: # %cond.store25
; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi)
-; SSE4-NEXT: .LBB15_28: # %else26
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqb %xmm1, %xmm4
-; SSE4-NEXT: pextrb $14, %xmm4, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $16384, %eax # imm = 0x4000
; SSE4-NEXT: je .LBB15_30
-; SSE4-NEXT: # %bb.29: # %cond.store27
+; SSE4-NEXT: .LBB15_29: # %cond.store27
; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi)
-; SSE4-NEXT: .LBB15_30: # %else28
-; SSE4-NEXT: pextrb $15, %xmm4, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB15_32
-; SSE4-NEXT: # %bb.31: # %cond.store29
-; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi)
-; SSE4-NEXT: .LBB15_32: # %else30
-; SSE4-NEXT: packsswb %xmm3, %xmm2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB15_34
-; SSE4-NEXT: # %bb.33: # %cond.store31
+; SSE4-NEXT: testl $32768, %eax # imm = 0x8000
+; SSE4-NEXT: jne .LBB15_31
+; SSE4-NEXT: jmp .LBB15_32
+; SSE4-NEXT: .LBB15_33: # %cond.store31
; SSE4-NEXT: pextrb $0, %xmm2, 16(%rdi)
-; SSE4-NEXT: .LBB15_34: # %else32
-; SSE4-NEXT: pextrb $1, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $131072, %eax # imm = 0x20000
; SSE4-NEXT: je .LBB15_36
-; SSE4-NEXT: # %bb.35: # %cond.store33
+; SSE4-NEXT: .LBB15_35: # %cond.store33
; SSE4-NEXT: pextrb $1, %xmm2, 17(%rdi)
-; SSE4-NEXT: .LBB15_36: # %else34
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE4-NEXT: pextrb $2, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $262144, %eax # imm = 0x40000
; SSE4-NEXT: je .LBB15_38
-; SSE4-NEXT: # %bb.37: # %cond.store35
+; SSE4-NEXT: .LBB15_37: # %cond.store35
; SSE4-NEXT: pextrb $2, %xmm2, 18(%rdi)
-; SSE4-NEXT: .LBB15_38: # %else36
-; SSE4-NEXT: pextrb $3, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $524288, %eax # imm = 0x80000
; SSE4-NEXT: je .LBB15_40
-; SSE4-NEXT: # %bb.39: # %cond.store37
+; SSE4-NEXT: .LBB15_39: # %cond.store37
; SSE4-NEXT: pextrb $3, %xmm2, 19(%rdi)
-; SSE4-NEXT: .LBB15_40: # %else38
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE4-NEXT: pextrb $4, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $1048576, %eax # imm = 0x100000
; SSE4-NEXT: je .LBB15_42
-; SSE4-NEXT: # %bb.41: # %cond.store39
+; SSE4-NEXT: .LBB15_41: # %cond.store39
; SSE4-NEXT: pextrb $4, %xmm2, 20(%rdi)
-; SSE4-NEXT: .LBB15_42: # %else40
-; SSE4-NEXT: pextrb $5, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $2097152, %eax # imm = 0x200000
; SSE4-NEXT: je .LBB15_44
-; SSE4-NEXT: # %bb.43: # %cond.store41
+; SSE4-NEXT: .LBB15_43: # %cond.store41
; SSE4-NEXT: pextrb $5, %xmm2, 21(%rdi)
-; SSE4-NEXT: .LBB15_44: # %else42
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE4-NEXT: pextrb $6, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $4194304, %eax # imm = 0x400000
; SSE4-NEXT: je .LBB15_46
-; SSE4-NEXT: # %bb.45: # %cond.store43
+; SSE4-NEXT: .LBB15_45: # %cond.store43
; SSE4-NEXT: pextrb $6, %xmm2, 22(%rdi)
-; SSE4-NEXT: .LBB15_46: # %else44
-; SSE4-NEXT: pextrb $7, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $8388608, %eax # imm = 0x800000
; SSE4-NEXT: je .LBB15_48
-; SSE4-NEXT: # %bb.47: # %cond.store45
+; SSE4-NEXT: .LBB15_47: # %cond.store45
; SSE4-NEXT: pextrb $7, %xmm2, 23(%rdi)
-; SSE4-NEXT: .LBB15_48: # %else46
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $16777216, %eax # imm = 0x1000000
; SSE4-NEXT: je .LBB15_50
-; SSE4-NEXT: # %bb.49: # %cond.store47
+; SSE4-NEXT: .LBB15_49: # %cond.store47
; SSE4-NEXT: pextrb $8, %xmm2, 24(%rdi)
-; SSE4-NEXT: .LBB15_50: # %else48
-; SSE4-NEXT: pextrb $9, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $33554432, %eax # imm = 0x2000000
; SSE4-NEXT: je .LBB15_52
-; SSE4-NEXT: # %bb.51: # %cond.store49
+; SSE4-NEXT: .LBB15_51: # %cond.store49
; SSE4-NEXT: pextrb $9, %xmm2, 25(%rdi)
-; SSE4-NEXT: .LBB15_52: # %else50
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE4-NEXT: pextrb $10, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $67108864, %eax # imm = 0x4000000
; SSE4-NEXT: je .LBB15_54
-; SSE4-NEXT: # %bb.53: # %cond.store51
+; SSE4-NEXT: .LBB15_53: # %cond.store51
; SSE4-NEXT: pextrb $10, %xmm2, 26(%rdi)
-; SSE4-NEXT: .LBB15_54: # %else52
-; SSE4-NEXT: pextrb $11, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $134217728, %eax # imm = 0x8000000
; SSE4-NEXT: je .LBB15_56
-; SSE4-NEXT: # %bb.55: # %cond.store53
+; SSE4-NEXT: .LBB15_55: # %cond.store53
; SSE4-NEXT: pextrb $11, %xmm2, 27(%rdi)
-; SSE4-NEXT: .LBB15_56: # %else54
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $268435456, %eax # imm = 0x10000000
; SSE4-NEXT: je .LBB15_58
-; SSE4-NEXT: # %bb.57: # %cond.store55
+; SSE4-NEXT: .LBB15_57: # %cond.store55
; SSE4-NEXT: pextrb $12, %xmm2, 28(%rdi)
-; SSE4-NEXT: .LBB15_58: # %else56
-; SSE4-NEXT: pextrb $13, %xmm0, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $536870912, %eax # imm = 0x20000000
; SSE4-NEXT: je .LBB15_60
-; SSE4-NEXT: # %bb.59: # %cond.store57
+; SSE4-NEXT: .LBB15_59: # %cond.store57
; SSE4-NEXT: pextrb $13, %xmm2, 29(%rdi)
-; SSE4-NEXT: .LBB15_60: # %else58
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqb %xmm0, %xmm5
-; SSE4-NEXT: pextrb $14, %xmm5, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $1073741824, %eax # imm = 0x40000000
; SSE4-NEXT: je .LBB15_62
-; SSE4-NEXT: # %bb.61: # %cond.store59
+; SSE4-NEXT: .LBB15_61: # %cond.store59
; SSE4-NEXT: pextrb $14, %xmm2, 30(%rdi)
-; SSE4-NEXT: .LBB15_62: # %else60
-; SSE4-NEXT: pextrb $15, %xmm5, %eax
-; SSE4-NEXT: notb %al
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testl $-2147483648, %eax # imm = 0x80000000
; SSE4-NEXT: je .LBB15_64
-; SSE4-NEXT: # %bb.63: # %cond.store61
+; SSE4-NEXT: .LBB15_63: # %cond.store61
; SSE4-NEXT: pextrb $15, %xmm2, 31(%rdi)
-; SSE4-NEXT: .LBB15_64: # %else62
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v32i16_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpacksswb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpextrb $0, %xmm5, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB15_2
-; AVX1-NEXT: # %bb.1: # %cond.store
-; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX1-NEXT: .LBB15_2: # %else
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB15_4
-; AVX1-NEXT: # %bb.3: # %cond.store1
-; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX1-NEXT: .LBB15_4: # %else2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $2, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB15_6
-; AVX1-NEXT: # %bb.5: # %cond.store3
-; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; AVX1-NEXT: .LBB15_6: # %else4
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpmovmskb %xmm3, %ecx
+; AVX1-NEXT: xorl $65535, %ecx # imm = 0xFFFF
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $3, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB15_8
-; AVX1-NEXT: # %bb.7: # %cond.store5
-; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: notl %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB15_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB15_3
+; AVX1-NEXT: .LBB15_4: # %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: jne .LBB15_5
+; AVX1-NEXT: .LBB15_6: # %else4
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB15_7
; AVX1-NEXT: .LBB15_8: # %else6
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $4, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $16, %al
+; AVX1-NEXT: jne .LBB15_9
+; AVX1-NEXT: .LBB15_10: # %else8
+; AVX1-NEXT: testb $32, %al
+; AVX1-NEXT: jne .LBB15_11
+; AVX1-NEXT: .LBB15_12: # %else10
+; AVX1-NEXT: testb $64, %al
+; AVX1-NEXT: jne .LBB15_13
+; AVX1-NEXT: .LBB15_14: # %else12
+; AVX1-NEXT: testb $-128, %al
+; AVX1-NEXT: jne .LBB15_15
+; AVX1-NEXT: .LBB15_16: # %else14
+; AVX1-NEXT: testl $256, %eax # imm = 0x100
+; AVX1-NEXT: jne .LBB15_17
+; AVX1-NEXT: .LBB15_18: # %else16
+; AVX1-NEXT: testl $512, %eax # imm = 0x200
+; AVX1-NEXT: jne .LBB15_19
+; AVX1-NEXT: .LBB15_20: # %else18
+; AVX1-NEXT: testl $1024, %eax # imm = 0x400
+; AVX1-NEXT: jne .LBB15_21
+; AVX1-NEXT: .LBB15_22: # %else20
+; AVX1-NEXT: testl $2048, %eax # imm = 0x800
+; AVX1-NEXT: jne .LBB15_23
+; AVX1-NEXT: .LBB15_24: # %else22
+; AVX1-NEXT: testl $4096, %eax # imm = 0x1000
+; AVX1-NEXT: jne .LBB15_25
+; AVX1-NEXT: .LBB15_26: # %else24
+; AVX1-NEXT: testl $8192, %eax # imm = 0x2000
+; AVX1-NEXT: jne .LBB15_27
+; AVX1-NEXT: .LBB15_28: # %else26
+; AVX1-NEXT: testl $16384, %eax # imm = 0x4000
+; AVX1-NEXT: jne .LBB15_29
+; AVX1-NEXT: .LBB15_30: # %else28
+; AVX1-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX1-NEXT: je .LBB15_32
+; AVX1-NEXT: .LBB15_31: # %cond.store29
+; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi)
+; AVX1-NEXT: .LBB15_32: # %else30
+; AVX1-NEXT: testl $65536, %eax # imm = 0x10000
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: jne .LBB15_33
+; AVX1-NEXT: # %bb.34: # %else32
+; AVX1-NEXT: testl $131072, %eax # imm = 0x20000
+; AVX1-NEXT: jne .LBB15_35
+; AVX1-NEXT: .LBB15_36: # %else34
+; AVX1-NEXT: testl $262144, %eax # imm = 0x40000
+; AVX1-NEXT: jne .LBB15_37
+; AVX1-NEXT: .LBB15_38: # %else36
+; AVX1-NEXT: testl $524288, %eax # imm = 0x80000
+; AVX1-NEXT: jne .LBB15_39
+; AVX1-NEXT: .LBB15_40: # %else38
+; AVX1-NEXT: testl $1048576, %eax # imm = 0x100000
+; AVX1-NEXT: jne .LBB15_41
+; AVX1-NEXT: .LBB15_42: # %else40
+; AVX1-NEXT: testl $2097152, %eax # imm = 0x200000
+; AVX1-NEXT: jne .LBB15_43
+; AVX1-NEXT: .LBB15_44: # %else42
+; AVX1-NEXT: testl $4194304, %eax # imm = 0x400000
+; AVX1-NEXT: jne .LBB15_45
+; AVX1-NEXT: .LBB15_46: # %else44
+; AVX1-NEXT: testl $8388608, %eax # imm = 0x800000
+; AVX1-NEXT: jne .LBB15_47
+; AVX1-NEXT: .LBB15_48: # %else46
+; AVX1-NEXT: testl $16777216, %eax # imm = 0x1000000
+; AVX1-NEXT: jne .LBB15_49
+; AVX1-NEXT: .LBB15_50: # %else48
+; AVX1-NEXT: testl $33554432, %eax # imm = 0x2000000
+; AVX1-NEXT: jne .LBB15_51
+; AVX1-NEXT: .LBB15_52: # %else50
+; AVX1-NEXT: testl $67108864, %eax # imm = 0x4000000
+; AVX1-NEXT: jne .LBB15_53
+; AVX1-NEXT: .LBB15_54: # %else52
+; AVX1-NEXT: testl $134217728, %eax # imm = 0x8000000
+; AVX1-NEXT: jne .LBB15_55
+; AVX1-NEXT: .LBB15_56: # %else54
+; AVX1-NEXT: testl $268435456, %eax # imm = 0x10000000
+; AVX1-NEXT: jne .LBB15_57
+; AVX1-NEXT: .LBB15_58: # %else56
+; AVX1-NEXT: testl $536870912, %eax # imm = 0x20000000
+; AVX1-NEXT: jne .LBB15_59
+; AVX1-NEXT: .LBB15_60: # %else58
+; AVX1-NEXT: testl $1073741824, %eax # imm = 0x40000000
+; AVX1-NEXT: jne .LBB15_61
+; AVX1-NEXT: .LBB15_62: # %else60
+; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000
+; AVX1-NEXT: jne .LBB15_63
+; AVX1-NEXT: .LBB15_64: # %else62
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB15_1: # %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je .LBB15_4
+; AVX1-NEXT: .LBB15_3: # %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: je .LBB15_6
+; AVX1-NEXT: .LBB15_5: # %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: je .LBB15_8
+; AVX1-NEXT: .LBB15_7: # %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
+; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB15_10
-; AVX1-NEXT: # %bb.9: # %cond.store7
+; AVX1-NEXT: .LBB15_9: # %cond.store7
; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi)
-; AVX1-NEXT: .LBB15_10: # %else8
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $5, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB15_12
-; AVX1-NEXT: # %bb.11: # %cond.store9
+; AVX1-NEXT: .LBB15_11: # %cond.store9
; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi)
-; AVX1-NEXT: .LBB15_12: # %else10
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $6, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB15_14
-; AVX1-NEXT: # %bb.13: # %cond.store11
+; AVX1-NEXT: .LBB15_13: # %cond.store11
; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi)
-; AVX1-NEXT: .LBB15_14: # %else12
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $7, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB15_16
-; AVX1-NEXT: # %bb.15: # %cond.store13
+; AVX1-NEXT: .LBB15_15: # %cond.store13
; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi)
-; AVX1-NEXT: .LBB15_16: # %else14
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $8, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $256, %eax # imm = 0x100
; AVX1-NEXT: je .LBB15_18
-; AVX1-NEXT: # %bb.17: # %cond.store15
+; AVX1-NEXT: .LBB15_17: # %cond.store15
; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi)
-; AVX1-NEXT: .LBB15_18: # %else16
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $9, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $512, %eax # imm = 0x200
; AVX1-NEXT: je .LBB15_20
-; AVX1-NEXT: # %bb.19: # %cond.store17
+; AVX1-NEXT: .LBB15_19: # %cond.store17
; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi)
-; AVX1-NEXT: .LBB15_20: # %else18
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $10, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $1024, %eax # imm = 0x400
; AVX1-NEXT: je .LBB15_22
-; AVX1-NEXT: # %bb.21: # %cond.store19
+; AVX1-NEXT: .LBB15_21: # %cond.store19
; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi)
-; AVX1-NEXT: .LBB15_22: # %else20
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $11, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $2048, %eax # imm = 0x800
; AVX1-NEXT: je .LBB15_24
-; AVX1-NEXT: # %bb.23: # %cond.store21
+; AVX1-NEXT: .LBB15_23: # %cond.store21
; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi)
-; AVX1-NEXT: .LBB15_24: # %else22
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $12, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $4096, %eax # imm = 0x1000
; AVX1-NEXT: je .LBB15_26
-; AVX1-NEXT: # %bb.25: # %cond.store23
+; AVX1-NEXT: .LBB15_25: # %cond.store23
; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi)
-; AVX1-NEXT: .LBB15_26: # %else24
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $13, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $8192, %eax # imm = 0x2000
; AVX1-NEXT: je .LBB15_28
-; AVX1-NEXT: # %bb.27: # %cond.store25
+; AVX1-NEXT: .LBB15_27: # %cond.store25
; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi)
-; AVX1-NEXT: .LBB15_28: # %else26
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $14, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $16384, %eax # imm = 0x4000
; AVX1-NEXT: je .LBB15_30
-; AVX1-NEXT: # %bb.29: # %cond.store27
+; AVX1-NEXT: .LBB15_29: # %cond.store27
; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi)
-; AVX1-NEXT: .LBB15_30: # %else28
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB15_32
-; AVX1-NEXT: # %bb.31: # %cond.store29
-; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi)
-; AVX1-NEXT: .LBB15_32: # %else30
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $0, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: je .LBB15_34
-; AVX1-NEXT: # %bb.33: # %cond.store31
+; AVX1-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX1-NEXT: jne .LBB15_31
+; AVX1-NEXT: jmp .LBB15_32
+; AVX1-NEXT: .LBB15_33: # %cond.store31
; AVX1-NEXT: vpextrb $0, %xmm0, 16(%rdi)
-; AVX1-NEXT: .LBB15_34: # %else32
-; AVX1-NEXT: vpextrb $1, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $131072, %eax # imm = 0x20000
; AVX1-NEXT: je .LBB15_36
-; AVX1-NEXT: # %bb.35: # %cond.store33
+; AVX1-NEXT: .LBB15_35: # %cond.store33
; AVX1-NEXT: vpextrb $1, %xmm0, 17(%rdi)
-; AVX1-NEXT: .LBB15_36: # %else34
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $262144, %eax # imm = 0x40000
; AVX1-NEXT: je .LBB15_38
-; AVX1-NEXT: # %bb.37: # %cond.store35
+; AVX1-NEXT: .LBB15_37: # %cond.store35
; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdi)
-; AVX1-NEXT: .LBB15_38: # %else36
-; AVX1-NEXT: vpextrb $3, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $524288, %eax # imm = 0x80000
; AVX1-NEXT: je .LBB15_40
-; AVX1-NEXT: # %bb.39: # %cond.store37
+; AVX1-NEXT: .LBB15_39: # %cond.store37
; AVX1-NEXT: vpextrb $3, %xmm0, 19(%rdi)
-; AVX1-NEXT: .LBB15_40: # %else38
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $1048576, %eax # imm = 0x100000
; AVX1-NEXT: je .LBB15_42
-; AVX1-NEXT: # %bb.41: # %cond.store39
+; AVX1-NEXT: .LBB15_41: # %cond.store39
; AVX1-NEXT: vpextrb $4, %xmm0, 20(%rdi)
-; AVX1-NEXT: .LBB15_42: # %else40
-; AVX1-NEXT: vpextrb $5, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $2097152, %eax # imm = 0x200000
; AVX1-NEXT: je .LBB15_44
-; AVX1-NEXT: # %bb.43: # %cond.store41
+; AVX1-NEXT: .LBB15_43: # %cond.store41
; AVX1-NEXT: vpextrb $5, %xmm0, 21(%rdi)
-; AVX1-NEXT: .LBB15_44: # %else42
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $4194304, %eax # imm = 0x400000
; AVX1-NEXT: je .LBB15_46
-; AVX1-NEXT: # %bb.45: # %cond.store43
+; AVX1-NEXT: .LBB15_45: # %cond.store43
; AVX1-NEXT: vpextrb $6, %xmm0, 22(%rdi)
-; AVX1-NEXT: .LBB15_46: # %else44
-; AVX1-NEXT: vpextrb $7, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $8388608, %eax # imm = 0x800000
; AVX1-NEXT: je .LBB15_48
-; AVX1-NEXT: # %bb.47: # %cond.store45
+; AVX1-NEXT: .LBB15_47: # %cond.store45
; AVX1-NEXT: vpextrb $7, %xmm0, 23(%rdi)
-; AVX1-NEXT: .LBB15_48: # %else46
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $16777216, %eax # imm = 0x1000000
; AVX1-NEXT: je .LBB15_50
-; AVX1-NEXT: # %bb.49: # %cond.store47
+; AVX1-NEXT: .LBB15_49: # %cond.store47
; AVX1-NEXT: vpextrb $8, %xmm0, 24(%rdi)
-; AVX1-NEXT: .LBB15_50: # %else48
-; AVX1-NEXT: vpextrb $9, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $33554432, %eax # imm = 0x2000000
; AVX1-NEXT: je .LBB15_52
-; AVX1-NEXT: # %bb.51: # %cond.store49
+; AVX1-NEXT: .LBB15_51: # %cond.store49
; AVX1-NEXT: vpextrb $9, %xmm0, 25(%rdi)
-; AVX1-NEXT: .LBB15_52: # %else50
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $67108864, %eax # imm = 0x4000000
; AVX1-NEXT: je .LBB15_54
-; AVX1-NEXT: # %bb.53: # %cond.store51
+; AVX1-NEXT: .LBB15_53: # %cond.store51
; AVX1-NEXT: vpextrb $10, %xmm0, 26(%rdi)
-; AVX1-NEXT: .LBB15_54: # %else52
-; AVX1-NEXT: vpextrb $11, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $134217728, %eax # imm = 0x8000000
; AVX1-NEXT: je .LBB15_56
-; AVX1-NEXT: # %bb.55: # %cond.store53
+; AVX1-NEXT: .LBB15_55: # %cond.store53
; AVX1-NEXT: vpextrb $11, %xmm0, 27(%rdi)
-; AVX1-NEXT: .LBB15_56: # %else54
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $268435456, %eax # imm = 0x10000000
; AVX1-NEXT: je .LBB15_58
-; AVX1-NEXT: # %bb.57: # %cond.store55
+; AVX1-NEXT: .LBB15_57: # %cond.store55
; AVX1-NEXT: vpextrb $12, %xmm0, 28(%rdi)
-; AVX1-NEXT: .LBB15_58: # %else56
-; AVX1-NEXT: vpextrb $13, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $536870912, %eax # imm = 0x20000000
; AVX1-NEXT: je .LBB15_60
-; AVX1-NEXT: # %bb.59: # %cond.store57
+; AVX1-NEXT: .LBB15_59: # %cond.store57
; AVX1-NEXT: vpextrb $13, %xmm0, 29(%rdi)
-; AVX1-NEXT: .LBB15_60: # %else58
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $1073741824, %eax # imm = 0x40000000
; AVX1-NEXT: je .LBB15_62
-; AVX1-NEXT: # %bb.61: # %cond.store59
+; AVX1-NEXT: .LBB15_61: # %cond.store59
; AVX1-NEXT: vpextrb $14, %xmm0, 30(%rdi)
-; AVX1-NEXT: .LBB15_62: # %else60
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000
; AVX1-NEXT: je .LBB15_64
-; AVX1-NEXT: # %bb.63: # %cond.store61
+; AVX1-NEXT: .LBB15_63: # %cond.store61
; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi)
-; AVX1-NEXT: .LBB15_64: # %else62
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v32i16_v32i8:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $0, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je .LBB15_2
-; AVX2-NEXT: # %bb.1: # %cond.store
+; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpmovmskb %ymm1, %eax
+; AVX2-NEXT: notl %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB15_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB15_3
+; AVX2-NEXT: .LBB15_4: # %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne .LBB15_5
+; AVX2-NEXT: .LBB15_6: # %else4
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB15_7
+; AVX2-NEXT: .LBB15_8: # %else6
+; AVX2-NEXT: testb $16, %al
+; AVX2-NEXT: jne .LBB15_9
+; AVX2-NEXT: .LBB15_10: # %else8
+; AVX2-NEXT: testb $32, %al
+; AVX2-NEXT: jne .LBB15_11
+; AVX2-NEXT: .LBB15_12: # %else10
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: jne .LBB15_13
+; AVX2-NEXT: .LBB15_14: # %else12
+; AVX2-NEXT: testb $-128, %al
+; AVX2-NEXT: jne .LBB15_15
+; AVX2-NEXT: .LBB15_16: # %else14
+; AVX2-NEXT: testl $256, %eax # imm = 0x100
+; AVX2-NEXT: jne .LBB15_17
+; AVX2-NEXT: .LBB15_18: # %else16
+; AVX2-NEXT: testl $512, %eax # imm = 0x200
+; AVX2-NEXT: jne .LBB15_19
+; AVX2-NEXT: .LBB15_20: # %else18
+; AVX2-NEXT: testl $1024, %eax # imm = 0x400
+; AVX2-NEXT: jne .LBB15_21
+; AVX2-NEXT: .LBB15_22: # %else20
+; AVX2-NEXT: testl $2048, %eax # imm = 0x800
+; AVX2-NEXT: jne .LBB15_23
+; AVX2-NEXT: .LBB15_24: # %else22
+; AVX2-NEXT: testl $4096, %eax # imm = 0x1000
+; AVX2-NEXT: jne .LBB15_25
+; AVX2-NEXT: .LBB15_26: # %else24
+; AVX2-NEXT: testl $8192, %eax # imm = 0x2000
+; AVX2-NEXT: jne .LBB15_27
+; AVX2-NEXT: .LBB15_28: # %else26
+; AVX2-NEXT: testl $16384, %eax # imm = 0x4000
+; AVX2-NEXT: jne .LBB15_29
+; AVX2-NEXT: .LBB15_30: # %else28
+; AVX2-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX2-NEXT: je .LBB15_32
+; AVX2-NEXT: .LBB15_31: # %cond.store29
+; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi)
+; AVX2-NEXT: .LBB15_32: # %else30
+; AVX2-NEXT: testl $65536, %eax # imm = 0x10000
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: jne .LBB15_33
+; AVX2-NEXT: # %bb.34: # %else32
+; AVX2-NEXT: testl $131072, %eax # imm = 0x20000
+; AVX2-NEXT: jne .LBB15_35
+; AVX2-NEXT: .LBB15_36: # %else34
+; AVX2-NEXT: testl $262144, %eax # imm = 0x40000
+; AVX2-NEXT: jne .LBB15_37
+; AVX2-NEXT: .LBB15_38: # %else36
+; AVX2-NEXT: testl $524288, %eax # imm = 0x80000
+; AVX2-NEXT: jne .LBB15_39
+; AVX2-NEXT: .LBB15_40: # %else38
+; AVX2-NEXT: testl $1048576, %eax # imm = 0x100000
+; AVX2-NEXT: jne .LBB15_41
+; AVX2-NEXT: .LBB15_42: # %else40
+; AVX2-NEXT: testl $2097152, %eax # imm = 0x200000
+; AVX2-NEXT: jne .LBB15_43
+; AVX2-NEXT: .LBB15_44: # %else42
+; AVX2-NEXT: testl $4194304, %eax # imm = 0x400000
+; AVX2-NEXT: jne .LBB15_45
+; AVX2-NEXT: .LBB15_46: # %else44
+; AVX2-NEXT: testl $8388608, %eax # imm = 0x800000
+; AVX2-NEXT: jne .LBB15_47
+; AVX2-NEXT: .LBB15_48: # %else46
+; AVX2-NEXT: testl $16777216, %eax # imm = 0x1000000
+; AVX2-NEXT: jne .LBB15_49
+; AVX2-NEXT: .LBB15_50: # %else48
+; AVX2-NEXT: testl $33554432, %eax # imm = 0x2000000
+; AVX2-NEXT: jne .LBB15_51
+; AVX2-NEXT: .LBB15_52: # %else50
+; AVX2-NEXT: testl $67108864, %eax # imm = 0x4000000
+; AVX2-NEXT: jne .LBB15_53
+; AVX2-NEXT: .LBB15_54: # %else52
+; AVX2-NEXT: testl $134217728, %eax # imm = 0x8000000
+; AVX2-NEXT: jne .LBB15_55
+; AVX2-NEXT: .LBB15_56: # %else54
+; AVX2-NEXT: testl $268435456, %eax # imm = 0x10000000
+; AVX2-NEXT: jne .LBB15_57
+; AVX2-NEXT: .LBB15_58: # %else56
+; AVX2-NEXT: testl $536870912, %eax # imm = 0x20000000
+; AVX2-NEXT: jne .LBB15_59
+; AVX2-NEXT: .LBB15_60: # %else58
+; AVX2-NEXT: testl $1073741824, %eax # imm = 0x40000000
+; AVX2-NEXT: jne .LBB15_61
+; AVX2-NEXT: .LBB15_62: # %else60
+; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000
+; AVX2-NEXT: jne .LBB15_63
+; AVX2-NEXT: .LBB15_64: # %else62
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB15_1: # %cond.store
; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX2-NEXT: .LBB15_2: # %else
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB15_4
-; AVX2-NEXT: # %bb.3: # %cond.store1
+; AVX2-NEXT: .LBB15_3: # %cond.store1
; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX2-NEXT: .LBB15_4: # %else2
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $2, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB15_6
-; AVX2-NEXT: # %bb.5: # %cond.store3
+; AVX2-NEXT: .LBB15_5: # %cond.store3
; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; AVX2-NEXT: .LBB15_6: # %else4
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $3, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB15_8
-; AVX2-NEXT: # %bb.7: # %cond.store5
+; AVX2-NEXT: .LBB15_7: # %cond.store5
; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; AVX2-NEXT: .LBB15_8: # %else6
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $4, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB15_10
-; AVX2-NEXT: # %bb.9: # %cond.store7
+; AVX2-NEXT: .LBB15_9: # %cond.store7
; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi)
-; AVX2-NEXT: .LBB15_10: # %else8
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $5, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB15_12
-; AVX2-NEXT: # %bb.11: # %cond.store9
+; AVX2-NEXT: .LBB15_11: # %cond.store9
; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi)
-; AVX2-NEXT: .LBB15_12: # %else10
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $6, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB15_14
-; AVX2-NEXT: # %bb.13: # %cond.store11
+; AVX2-NEXT: .LBB15_13: # %cond.store11
; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi)
-; AVX2-NEXT: .LBB15_14: # %else12
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $7, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB15_16
-; AVX2-NEXT: # %bb.15: # %cond.store13
+; AVX2-NEXT: .LBB15_15: # %cond.store13
; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi)
-; AVX2-NEXT: .LBB15_16: # %else14
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $8, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $256, %eax # imm = 0x100
; AVX2-NEXT: je .LBB15_18
-; AVX2-NEXT: # %bb.17: # %cond.store15
+; AVX2-NEXT: .LBB15_17: # %cond.store15
; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi)
-; AVX2-NEXT: .LBB15_18: # %else16
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $9, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $512, %eax # imm = 0x200
; AVX2-NEXT: je .LBB15_20
-; AVX2-NEXT: # %bb.19: # %cond.store17
+; AVX2-NEXT: .LBB15_19: # %cond.store17
; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi)
-; AVX2-NEXT: .LBB15_20: # %else18
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $10, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $1024, %eax # imm = 0x400
; AVX2-NEXT: je .LBB15_22
-; AVX2-NEXT: # %bb.21: # %cond.store19
+; AVX2-NEXT: .LBB15_21: # %cond.store19
; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi)
-; AVX2-NEXT: .LBB15_22: # %else20
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $11, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $2048, %eax # imm = 0x800
; AVX2-NEXT: je .LBB15_24
-; AVX2-NEXT: # %bb.23: # %cond.store21
+; AVX2-NEXT: .LBB15_23: # %cond.store21
; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi)
-; AVX2-NEXT: .LBB15_24: # %else22
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $12, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $4096, %eax # imm = 0x1000
; AVX2-NEXT: je .LBB15_26
-; AVX2-NEXT: # %bb.25: # %cond.store23
+; AVX2-NEXT: .LBB15_25: # %cond.store23
; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi)
-; AVX2-NEXT: .LBB15_26: # %else24
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $13, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $8192, %eax # imm = 0x2000
; AVX2-NEXT: je .LBB15_28
-; AVX2-NEXT: # %bb.27: # %cond.store25
+; AVX2-NEXT: .LBB15_27: # %cond.store25
; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi)
-; AVX2-NEXT: .LBB15_28: # %else26
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpextrb $14, %xmm3, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $16384, %eax # imm = 0x4000
; AVX2-NEXT: je .LBB15_30
-; AVX2-NEXT: # %bb.29: # %cond.store27
+; AVX2-NEXT: .LBB15_29: # %cond.store27
; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi)
-; AVX2-NEXT: .LBB15_30: # %else28
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je .LBB15_32
-; AVX2-NEXT: # %bb.31: # %cond.store29
-; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi)
-; AVX2-NEXT: .LBB15_32: # %else30
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: je .LBB15_34
-; AVX2-NEXT: # %bb.33: # %cond.store31
+; AVX2-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX2-NEXT: jne .LBB15_31
+; AVX2-NEXT: jmp .LBB15_32
+; AVX2-NEXT: .LBB15_33: # %cond.store31
; AVX2-NEXT: vpextrb $0, %xmm0, 16(%rdi)
-; AVX2-NEXT: .LBB15_34: # %else32
-; AVX2-NEXT: vpextrb $1, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $131072, %eax # imm = 0x20000
; AVX2-NEXT: je .LBB15_36
-; AVX2-NEXT: # %bb.35: # %cond.store33
+; AVX2-NEXT: .LBB15_35: # %cond.store33
; AVX2-NEXT: vpextrb $1, %xmm0, 17(%rdi)
-; AVX2-NEXT: .LBB15_36: # %else34
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $262144, %eax # imm = 0x40000
; AVX2-NEXT: je .LBB15_38
-; AVX2-NEXT: # %bb.37: # %cond.store35
+; AVX2-NEXT: .LBB15_37: # %cond.store35
; AVX2-NEXT: vpextrb $2, %xmm0, 18(%rdi)
-; AVX2-NEXT: .LBB15_38: # %else36
-; AVX2-NEXT: vpextrb $3, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $524288, %eax # imm = 0x80000
; AVX2-NEXT: je .LBB15_40
-; AVX2-NEXT: # %bb.39: # %cond.store37
+; AVX2-NEXT: .LBB15_39: # %cond.store37
; AVX2-NEXT: vpextrb $3, %xmm0, 19(%rdi)
-; AVX2-NEXT: .LBB15_40: # %else38
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $1048576, %eax # imm = 0x100000
; AVX2-NEXT: je .LBB15_42
-; AVX2-NEXT: # %bb.41: # %cond.store39
+; AVX2-NEXT: .LBB15_41: # %cond.store39
; AVX2-NEXT: vpextrb $4, %xmm0, 20(%rdi)
-; AVX2-NEXT: .LBB15_42: # %else40
-; AVX2-NEXT: vpextrb $5, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $2097152, %eax # imm = 0x200000
; AVX2-NEXT: je .LBB15_44
-; AVX2-NEXT: # %bb.43: # %cond.store41
+; AVX2-NEXT: .LBB15_43: # %cond.store41
; AVX2-NEXT: vpextrb $5, %xmm0, 21(%rdi)
-; AVX2-NEXT: .LBB15_44: # %else42
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $4194304, %eax # imm = 0x400000
; AVX2-NEXT: je .LBB15_46
-; AVX2-NEXT: # %bb.45: # %cond.store43
+; AVX2-NEXT: .LBB15_45: # %cond.store43
; AVX2-NEXT: vpextrb $6, %xmm0, 22(%rdi)
-; AVX2-NEXT: .LBB15_46: # %else44
-; AVX2-NEXT: vpextrb $7, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $8388608, %eax # imm = 0x800000
; AVX2-NEXT: je .LBB15_48
-; AVX2-NEXT: # %bb.47: # %cond.store45
+; AVX2-NEXT: .LBB15_47: # %cond.store45
; AVX2-NEXT: vpextrb $7, %xmm0, 23(%rdi)
-; AVX2-NEXT: .LBB15_48: # %else46
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $16777216, %eax # imm = 0x1000000
; AVX2-NEXT: je .LBB15_50
-; AVX2-NEXT: # %bb.49: # %cond.store47
+; AVX2-NEXT: .LBB15_49: # %cond.store47
; AVX2-NEXT: vpextrb $8, %xmm0, 24(%rdi)
-; AVX2-NEXT: .LBB15_50: # %else48
-; AVX2-NEXT: vpextrb $9, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $33554432, %eax # imm = 0x2000000
; AVX2-NEXT: je .LBB15_52
-; AVX2-NEXT: # %bb.51: # %cond.store49
+; AVX2-NEXT: .LBB15_51: # %cond.store49
; AVX2-NEXT: vpextrb $9, %xmm0, 25(%rdi)
-; AVX2-NEXT: .LBB15_52: # %else50
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $67108864, %eax # imm = 0x4000000
; AVX2-NEXT: je .LBB15_54
-; AVX2-NEXT: # %bb.53: # %cond.store51
+; AVX2-NEXT: .LBB15_53: # %cond.store51
; AVX2-NEXT: vpextrb $10, %xmm0, 26(%rdi)
-; AVX2-NEXT: .LBB15_54: # %else52
-; AVX2-NEXT: vpextrb $11, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $134217728, %eax # imm = 0x8000000
; AVX2-NEXT: je .LBB15_56
-; AVX2-NEXT: # %bb.55: # %cond.store53
+; AVX2-NEXT: .LBB15_55: # %cond.store53
; AVX2-NEXT: vpextrb $11, %xmm0, 27(%rdi)
-; AVX2-NEXT: .LBB15_56: # %else54
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $268435456, %eax # imm = 0x10000000
; AVX2-NEXT: je .LBB15_58
-; AVX2-NEXT: # %bb.57: # %cond.store55
+; AVX2-NEXT: .LBB15_57: # %cond.store55
; AVX2-NEXT: vpextrb $12, %xmm0, 28(%rdi)
-; AVX2-NEXT: .LBB15_58: # %else56
-; AVX2-NEXT: vpextrb $13, %xmm2, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $536870912, %eax # imm = 0x20000000
; AVX2-NEXT: je .LBB15_60
-; AVX2-NEXT: # %bb.59: # %cond.store57
+; AVX2-NEXT: .LBB15_59: # %cond.store57
; AVX2-NEXT: vpextrb $13, %xmm0, 29(%rdi)
-; AVX2-NEXT: .LBB15_60: # %else58
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $1073741824, %eax # imm = 0x40000000
; AVX2-NEXT: je .LBB15_62
-; AVX2-NEXT: # %bb.61: # %cond.store59
+; AVX2-NEXT: .LBB15_61: # %cond.store59
; AVX2-NEXT: vpextrb $14, %xmm0, 30(%rdi)
-; AVX2-NEXT: .LBB15_62: # %else60
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000
; AVX2-NEXT: je .LBB15_64
-; AVX2-NEXT: # %bb.63: # %cond.store61
+; AVX2-NEXT: .LBB15_63: # %cond.store61
; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi)
-; AVX2-NEXT: .LBB15_64: # %else62
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v32i16_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpternlogq $15, %zmm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4
-; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpminsw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpminsw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
-; AVX512F-NEXT: vpmaxsw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpminsw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
+; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxsw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpmovmskb %ymm2, %eax
+; AVX512F-NEXT: notl %eax
; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB15_2
-; AVX512F-NEXT: # %bb.1: # %cond.store
+; AVX512F-NEXT: jne .LBB15_1
+; AVX512F-NEXT: # %bb.2: # %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne .LBB15_3
+; AVX512F-NEXT: .LBB15_4: # %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: jne .LBB15_5
+; AVX512F-NEXT: .LBB15_6: # %else4
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne .LBB15_7
+; AVX512F-NEXT: .LBB15_8: # %else6
+; AVX512F-NEXT: testb $16, %al
+; AVX512F-NEXT: jne .LBB15_9
+; AVX512F-NEXT: .LBB15_10: # %else8
+; AVX512F-NEXT: testb $32, %al
+; AVX512F-NEXT: jne .LBB15_11
+; AVX512F-NEXT: .LBB15_12: # %else10
+; AVX512F-NEXT: testb $64, %al
+; AVX512F-NEXT: jne .LBB15_13
+; AVX512F-NEXT: .LBB15_14: # %else12
+; AVX512F-NEXT: testb $-128, %al
+; AVX512F-NEXT: jne .LBB15_15
+; AVX512F-NEXT: .LBB15_16: # %else14
+; AVX512F-NEXT: testl $256, %eax # imm = 0x100
+; AVX512F-NEXT: jne .LBB15_17
+; AVX512F-NEXT: .LBB15_18: # %else16
+; AVX512F-NEXT: testl $512, %eax # imm = 0x200
+; AVX512F-NEXT: jne .LBB15_19
+; AVX512F-NEXT: .LBB15_20: # %else18
+; AVX512F-NEXT: testl $1024, %eax # imm = 0x400
+; AVX512F-NEXT: jne .LBB15_21
+; AVX512F-NEXT: .LBB15_22: # %else20
+; AVX512F-NEXT: testl $2048, %eax # imm = 0x800
+; AVX512F-NEXT: jne .LBB15_23
+; AVX512F-NEXT: .LBB15_24: # %else22
+; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000
+; AVX512F-NEXT: jne .LBB15_25
+; AVX512F-NEXT: .LBB15_26: # %else24
+; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000
+; AVX512F-NEXT: jne .LBB15_27
+; AVX512F-NEXT: .LBB15_28: # %else26
+; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000
+; AVX512F-NEXT: jne .LBB15_29
+; AVX512F-NEXT: .LBB15_30: # %else28
+; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX512F-NEXT: je .LBB15_32
+; AVX512F-NEXT: .LBB15_31: # %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi)
+; AVX512F-NEXT: .LBB15_32: # %else30
+; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: jne .LBB15_33
+; AVX512F-NEXT: # %bb.34: # %else32
+; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000
+; AVX512F-NEXT: jne .LBB15_35
+; AVX512F-NEXT: .LBB15_36: # %else34
+; AVX512F-NEXT: testl $262144, %eax # imm = 0x40000
+; AVX512F-NEXT: jne .LBB15_37
+; AVX512F-NEXT: .LBB15_38: # %else36
+; AVX512F-NEXT: testl $524288, %eax # imm = 0x80000
+; AVX512F-NEXT: jne .LBB15_39
+; AVX512F-NEXT: .LBB15_40: # %else38
+; AVX512F-NEXT: testl $1048576, %eax # imm = 0x100000
+; AVX512F-NEXT: jne .LBB15_41
+; AVX512F-NEXT: .LBB15_42: # %else40
+; AVX512F-NEXT: testl $2097152, %eax # imm = 0x200000
+; AVX512F-NEXT: jne .LBB15_43
+; AVX512F-NEXT: .LBB15_44: # %else42
+; AVX512F-NEXT: testl $4194304, %eax # imm = 0x400000
+; AVX512F-NEXT: jne .LBB15_45
+; AVX512F-NEXT: .LBB15_46: # %else44
+; AVX512F-NEXT: testl $8388608, %eax # imm = 0x800000
+; AVX512F-NEXT: jne .LBB15_47
+; AVX512F-NEXT: .LBB15_48: # %else46
+; AVX512F-NEXT: testl $16777216, %eax # imm = 0x1000000
+; AVX512F-NEXT: jne .LBB15_49
+; AVX512F-NEXT: .LBB15_50: # %else48
+; AVX512F-NEXT: testl $33554432, %eax # imm = 0x2000000
+; AVX512F-NEXT: jne .LBB15_51
+; AVX512F-NEXT: .LBB15_52: # %else50
+; AVX512F-NEXT: testl $67108864, %eax # imm = 0x4000000
+; AVX512F-NEXT: jne .LBB15_53
+; AVX512F-NEXT: .LBB15_54: # %else52
+; AVX512F-NEXT: testl $134217728, %eax # imm = 0x8000000
+; AVX512F-NEXT: jne .LBB15_55
+; AVX512F-NEXT: .LBB15_56: # %else54
+; AVX512F-NEXT: testl $268435456, %eax # imm = 0x10000000
+; AVX512F-NEXT: jne .LBB15_57
+; AVX512F-NEXT: .LBB15_58: # %else56
+; AVX512F-NEXT: testl $536870912, %eax # imm = 0x20000000
+; AVX512F-NEXT: jne .LBB15_59
+; AVX512F-NEXT: .LBB15_60: # %else58
+; AVX512F-NEXT: testl $1073741824, %eax # imm = 0x40000000
+; AVX512F-NEXT: jne .LBB15_61
+; AVX512F-NEXT: .LBB15_62: # %else60
+; AVX512F-NEXT: testl $-2147483648, %eax # imm = 0x80000000
+; AVX512F-NEXT: jne .LBB15_63
+; AVX512F-NEXT: .LBB15_64: # %else62
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: .LBB15_1: # %cond.store
; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX512F-NEXT: .LBB15_2: # %else
-; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB15_4
-; AVX512F-NEXT: # %bb.3: # %cond.store1
+; AVX512F-NEXT: .LBB15_3: # %cond.store1
; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX512F-NEXT: .LBB15_4: # %else2
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB15_6
-; AVX512F-NEXT: # %bb.5: # %cond.store3
+; AVX512F-NEXT: .LBB15_5: # %cond.store3
; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; AVX512F-NEXT: .LBB15_6: # %else4
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB15_8
-; AVX512F-NEXT: # %bb.7: # %cond.store5
+; AVX512F-NEXT: .LBB15_7: # %cond.store5
; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; AVX512F-NEXT: .LBB15_8: # %else6
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB15_10
-; AVX512F-NEXT: # %bb.9: # %cond.store7
+; AVX512F-NEXT: .LBB15_9: # %cond.store7
; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
-; AVX512F-NEXT: .LBB15_10: # %else8
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB15_12
-; AVX512F-NEXT: # %bb.11: # %cond.store9
+; AVX512F-NEXT: .LBB15_11: # %cond.store9
; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
-; AVX512F-NEXT: .LBB15_12: # %else10
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB15_14
-; AVX512F-NEXT: # %bb.13: # %cond.store11
+; AVX512F-NEXT: .LBB15_13: # %cond.store11
; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
-; AVX512F-NEXT: .LBB15_14: # %else12
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB15_16
-; AVX512F-NEXT: # %bb.15: # %cond.store13
+; AVX512F-NEXT: .LBB15_15: # %cond.store13
; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
-; AVX512F-NEXT: .LBB15_16: # %else14
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $256, %eax # imm = 0x100
; AVX512F-NEXT: je .LBB15_18
-; AVX512F-NEXT: # %bb.17: # %cond.store15
+; AVX512F-NEXT: .LBB15_17: # %cond.store15
; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi)
-; AVX512F-NEXT: .LBB15_18: # %else16
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB15_20
-; AVX512F-NEXT: # %bb.19: # %cond.store17
-; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi)
-; AVX512F-NEXT: .LBB15_20: # %else18
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $10, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $512, %eax # imm = 0x200
+; AVX512F-NEXT: je .LBB15_20
+; AVX512F-NEXT: .LBB15_19: # %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi)
+; AVX512F-NEXT: testl $1024, %eax # imm = 0x400
; AVX512F-NEXT: je .LBB15_22
-; AVX512F-NEXT: # %bb.21: # %cond.store19
+; AVX512F-NEXT: .LBB15_21: # %cond.store19
; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi)
-; AVX512F-NEXT: .LBB15_22: # %else20
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $11, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $2048, %eax # imm = 0x800
; AVX512F-NEXT: je .LBB15_24
-; AVX512F-NEXT: # %bb.23: # %cond.store21
+; AVX512F-NEXT: .LBB15_23: # %cond.store21
; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi)
-; AVX512F-NEXT: .LBB15_24: # %else22
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000
; AVX512F-NEXT: je .LBB15_26
-; AVX512F-NEXT: # %bb.25: # %cond.store23
+; AVX512F-NEXT: .LBB15_25: # %cond.store23
; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi)
-; AVX512F-NEXT: .LBB15_26: # %else24
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $13, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000
; AVX512F-NEXT: je .LBB15_28
-; AVX512F-NEXT: # %bb.27: # %cond.store25
+; AVX512F-NEXT: .LBB15_27: # %cond.store25
; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi)
-; AVX512F-NEXT: .LBB15_28: # %else26
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000
; AVX512F-NEXT: je .LBB15_30
-; AVX512F-NEXT: # %bb.29: # %cond.store27
+; AVX512F-NEXT: .LBB15_29: # %cond.store27
; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi)
-; AVX512F-NEXT: .LBB15_30: # %else28
-; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB15_32
-; AVX512F-NEXT: # %bb.31: # %cond.store29
-; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi)
-; AVX512F-NEXT: .LBB15_32: # %else30
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: je .LBB15_34
-; AVX512F-NEXT: # %bb.33: # %cond.store31
+; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000
+; AVX512F-NEXT: jne .LBB15_31
+; AVX512F-NEXT: jmp .LBB15_32
+; AVX512F-NEXT: .LBB15_33: # %cond.store31
; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi)
-; AVX512F-NEXT: .LBB15_34: # %else32
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000
; AVX512F-NEXT: je .LBB15_36
-; AVX512F-NEXT: # %bb.35: # %cond.store33
+; AVX512F-NEXT: .LBB15_35: # %cond.store33
; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi)
-; AVX512F-NEXT: .LBB15_36: # %else34
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $262144, %eax # imm = 0x40000
; AVX512F-NEXT: je .LBB15_38
-; AVX512F-NEXT: # %bb.37: # %cond.store35
+; AVX512F-NEXT: .LBB15_37: # %cond.store35
; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi)
-; AVX512F-NEXT: .LBB15_38: # %else36
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $524288, %eax # imm = 0x80000
; AVX512F-NEXT: je .LBB15_40
-; AVX512F-NEXT: # %bb.39: # %cond.store37
+; AVX512F-NEXT: .LBB15_39: # %cond.store37
; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi)
-; AVX512F-NEXT: .LBB15_40: # %else38
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $1048576, %eax # imm = 0x100000
; AVX512F-NEXT: je .LBB15_42
-; AVX512F-NEXT: # %bb.41: # %cond.store39
+; AVX512F-NEXT: .LBB15_41: # %cond.store39
; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi)
-; AVX512F-NEXT: .LBB15_42: # %else40
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $2097152, %eax # imm = 0x200000
; AVX512F-NEXT: je .LBB15_44
-; AVX512F-NEXT: # %bb.43: # %cond.store41
+; AVX512F-NEXT: .LBB15_43: # %cond.store41
; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi)
-; AVX512F-NEXT: .LBB15_44: # %else42
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $4194304, %eax # imm = 0x400000
; AVX512F-NEXT: je .LBB15_46
-; AVX512F-NEXT: # %bb.45: # %cond.store43
+; AVX512F-NEXT: .LBB15_45: # %cond.store43
; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi)
-; AVX512F-NEXT: .LBB15_46: # %else44
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $8388608, %eax # imm = 0x800000
; AVX512F-NEXT: je .LBB15_48
-; AVX512F-NEXT: # %bb.47: # %cond.store45
+; AVX512F-NEXT: .LBB15_47: # %cond.store45
; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi)
-; AVX512F-NEXT: .LBB15_48: # %else46
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $16777216, %eax # imm = 0x1000000
; AVX512F-NEXT: je .LBB15_50
-; AVX512F-NEXT: # %bb.49: # %cond.store47
+; AVX512F-NEXT: .LBB15_49: # %cond.store47
; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi)
-; AVX512F-NEXT: .LBB15_50: # %else48
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $33554432, %eax # imm = 0x2000000
; AVX512F-NEXT: je .LBB15_52
-; AVX512F-NEXT: # %bb.51: # %cond.store49
+; AVX512F-NEXT: .LBB15_51: # %cond.store49
; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi)
-; AVX512F-NEXT: .LBB15_52: # %else50
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftrw $10, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testl $67108864, %eax # imm = 0x4000000
; AVX512F-NEXT: je .LBB15_54
-; AVX512F-NEXT: # %bb.53: # %cond.store51
+; AVX512F-NEXT: .LBB15_53: # %cond.store51
%cond.store51 ; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX512F-NEXT: .LBB15_54: # %else52 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $134217728, %eax # imm = 0x8000000 ; AVX512F-NEXT: je .LBB15_56 -; AVX512F-NEXT: # %bb.55: # %cond.store53 +; AVX512F-NEXT: .LBB15_55: # %cond.store53 ; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX512F-NEXT: .LBB15_56: # %else54 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $268435456, %eax # imm = 0x10000000 ; AVX512F-NEXT: je .LBB15_58 -; AVX512F-NEXT: # %bb.57: # %cond.store55 +; AVX512F-NEXT: .LBB15_57: # %cond.store55 ; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX512F-NEXT: .LBB15_58: # %else56 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $536870912, %eax # imm = 0x20000000 ; AVX512F-NEXT: je .LBB15_60 -; AVX512F-NEXT: # %bb.59: # %cond.store57 +; AVX512F-NEXT: .LBB15_59: # %cond.store57 ; AVX512F-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX512F-NEXT: .LBB15_60: # %else58 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; AVX512F-NEXT: je .LBB15_62 -; AVX512F-NEXT: # %bb.61: # %cond.store59 +; AVX512F-NEXT: .LBB15_61: # %cond.store59 ; AVX512F-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX512F-NEXT: .LBB15_62: # %else60 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX512F-NEXT: je .LBB15_64 -; AVX512F-NEXT: # %bb.63: # %cond.store61 +; AVX512F-NEXT: .LBB15_63: # %cond.store61 ; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX512F-NEXT: .LBB15_64: # %else62 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7431,747 +6742,600 @@ ; SSE2-LABEL: truncstore_v16i16_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE2-NEXT: packsswb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al +; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE2-NEXT: pmovmskb %xmm3, %eax +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB16_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB16_2: # %else -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; 
SSE2-NEXT: je .LBB16_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB16_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB16_3 ; SSE2-NEXT: .LBB16_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB16_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB16_5 ; SSE2-NEXT: .LBB16_6: # %else4 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB16_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB16_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB16_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB16_10: # %else8 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB16_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB16_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB16_14: # %else12 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB16_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB16_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB16_18: # %else16 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB16_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB16_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB16_22: # %else20 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB16_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 
; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB16_24: # %else22 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB16_26: # %else24 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB16_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB16_28: # %else26 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm2 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx -; SSE2-NEXT: je .LBB16_30 -; SSE2-NEXT: # %bb.29: # %cond.store27 +; SSE2-NEXT: jne .LBB16_29 +; SSE2-NEXT: # %bb.30: # %else28 +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB16_31 +; SSE2-NEXT: .LBB16_32: # %else30 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB16_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB16_4 +; SSE2-NEXT: .LBB16_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB16_6 +; SSE2-NEXT: .LBB16_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB16_7 +; SSE2-NEXT: jmp .LBB16_8 +; SSE2-NEXT: .LBB16_29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) -; SSE2-NEXT: .LBB16_30: # %else28 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB16_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 +; SSE2-NEXT: .LBB16_31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) -; SSE2-NEXT: .LBB16_32: # %else30 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i16_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE4-NEXT: packsswb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm3, %eax -; SSE4-NEXT: notb %al +; SSE4-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE4-NEXT: pmovmskb %xmm3, %eax +; SSE4-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB16_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB16_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB16_3 +; SSE4-NEXT: .LBB16_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB16_5 +; SSE4-NEXT: .LBB16_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB16_7 +; SSE4-NEXT: .LBB16_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB16_9 +; SSE4-NEXT: .LBB16_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB16_11 +; SSE4-NEXT: .LBB16_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB16_13 +; SSE4-NEXT: .LBB16_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB16_15 +; SSE4-NEXT: .LBB16_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB16_17 +; SSE4-NEXT: .LBB16_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB16_19 +; SSE4-NEXT: .LBB16_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB16_21 +; 
SSE4-NEXT: .LBB16_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB16_23 +; SSE4-NEXT: .LBB16_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB16_25 +; SSE4-NEXT: .LBB16_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB16_27 +; SSE4-NEXT: .LBB16_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB16_29 +; SSE4-NEXT: .LBB16_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB16_31 +; SSE4-NEXT: .LBB16_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB16_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB16_2: # %else -; SSE4-NEXT: pextrb $1, %xmm3, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB16_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB16_3: # %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB16_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $2, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB16_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB16_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB16_6: # %else4 -; SSE4-NEXT: pextrb $3, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB16_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB16_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB16_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB16_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB16_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB16_10: # %else8 -; SSE4-NEXT: pextrb $5, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB16_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB16_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB16_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $6, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB16_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB16_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB16_14: # %else12 -; SSE4-NEXT: pextrb $7, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB16_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB16_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB16_16: # %else14 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB16_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB16_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB16_18: # %else16 -; SSE4-NEXT: pextrb $9, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB16_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; 
SSE4-NEXT: .LBB16_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB16_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $10, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB16_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB16_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB16_22: # %else20 -; SSE4-NEXT: pextrb $11, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB16_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB16_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB16_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB16_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB16_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB16_26: # %else24 -; SSE4-NEXT: pextrb $13, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB16_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB16_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB16_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm2 -; SSE4-NEXT: pextrb $14, %xmm2, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB16_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB16_29: # %cond.store27 ; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB16_30: # %else28 -; SSE4-NEXT: pextrb $15, %xmm2, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB16_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB16_31: # %cond.store29 ; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB16_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: notb %al +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB16_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB16_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB16_3 +; AVX1-NEXT: .LBB16_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB16_5 +; AVX1-NEXT: .LBB16_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB16_7 +; AVX1-NEXT: .LBB16_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB16_9 +; AVX1-NEXT: .LBB16_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB16_11 +; AVX1-NEXT: .LBB16_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB16_13 +; AVX1-NEXT: .LBB16_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB16_15 +; AVX1-NEXT: .LBB16_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB16_17 +; AVX1-NEXT: .LBB16_18: # %else16 +; 
AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB16_19 +; AVX1-NEXT: .LBB16_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB16_21 +; AVX1-NEXT: .LBB16_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB16_23 +; AVX1-NEXT: .LBB16_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB16_25 +; AVX1-NEXT: .LBB16_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB16_27 +; AVX1-NEXT: .LBB16_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB16_29 +; AVX1-NEXT: .LBB16_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB16_31 +; AVX1-NEXT: .LBB16_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB16_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB16_2: # %else -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB16_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB16_3: # %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB16_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB16_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB16_5: # %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB16_6: # %else4 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB16_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB16_7: # %cond.store5 ; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB16_8: # %else6 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB16_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB16_9: # %cond.store7 ; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB16_10: # %else8 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB16_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB16_11: # %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB16_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB16_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB16_13: # %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB16_14: # %else12 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB16_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB16_15: # %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB16_16: # %else14 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB16_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: .LBB16_17: # 
%cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB16_18: # %else16 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB16_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB16_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB16_20: # %else18 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB16_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB16_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB16_22: # %else20 -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB16_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB16_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB16_24: # %else22 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB16_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB16_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB16_26: # %else24 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB16_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB16_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB16_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB16_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB16_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB16_30: # %else28 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB16_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB16_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB16_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: notb %al +; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB16_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB16_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB16_3 +; AVX2-NEXT: .LBB16_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB16_5 +; AVX2-NEXT: .LBB16_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB16_7 +; AVX2-NEXT: .LBB16_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB16_9 +; AVX2-NEXT: .LBB16_10: # %else8 +; AVX2-NEXT: 
testb $32, %al +; AVX2-NEXT: jne .LBB16_11 +; AVX2-NEXT: .LBB16_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB16_13 +; AVX2-NEXT: .LBB16_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB16_15 +; AVX2-NEXT: .LBB16_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB16_17 +; AVX2-NEXT: .LBB16_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB16_19 +; AVX2-NEXT: .LBB16_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB16_21 +; AVX2-NEXT: .LBB16_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB16_23 +; AVX2-NEXT: .LBB16_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB16_25 +; AVX2-NEXT: .LBB16_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB16_27 +; AVX2-NEXT: .LBB16_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB16_29 +; AVX2-NEXT: .LBB16_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB16_31 +; AVX2-NEXT: .LBB16_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB16_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB16_2: # %else -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB16_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB16_3: # %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB16_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB16_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB16_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB16_6: # %else4 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB16_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB16_7: # %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB16_8: # %else6 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB16_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB16_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB16_10: # %else8 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB16_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 -; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB16_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: je .LBB16_12 +; AVX2-NEXT: .LBB16_11: # %cond.store9 +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB16_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB16_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB16_14: # %else12 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB16_16 -; AVX2-NEXT: # %bb.15: # 
%cond.store13 +; AVX2-NEXT: .LBB16_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB16_16: # %else14 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB16_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB16_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB16_18: # %else16 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB16_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB16_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX2-NEXT: .LBB16_20: # %else18 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB16_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB16_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB16_22: # %else20 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB16_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB16_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB16_24: # %else22 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB16_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB16_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB16_26: # %else24 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB16_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB16_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB16_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB16_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB16_29: # %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB16_30: # %else28 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB16_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB16_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB16_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v16i16_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: 
vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpmovmskb %xmm1, %eax +; AVX512F-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB16_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB16_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB16_3 +; AVX512F-NEXT: .LBB16_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB16_5 +; AVX512F-NEXT: .LBB16_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB16_7 +; AVX512F-NEXT: .LBB16_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB16_9 +; AVX512F-NEXT: .LBB16_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB16_11 +; AVX512F-NEXT: .LBB16_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB16_13 +; AVX512F-NEXT: .LBB16_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB16_15 +; AVX512F-NEXT: .LBB16_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB16_17 +; AVX512F-NEXT: .LBB16_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB16_19 +; AVX512F-NEXT: .LBB16_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB16_21 +; AVX512F-NEXT: .LBB16_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB16_23 +; AVX512F-NEXT: .LBB16_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB16_25 +; AVX512F-NEXT: .LBB16_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB16_27 +; AVX512F-NEXT: .LBB16_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB16_29 +; AVX512F-NEXT: .LBB16_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB16_31 +; AVX512F-NEXT: .LBB16_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB16_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB16_2: # %else -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB16_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB16_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB16_4: # %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB16_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB16_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB16_6: # %else4 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, 
%zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB16_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB16_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB16_8: # %else6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB16_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB16_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB16_10: # %else8 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB16_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB16_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB16_12: # %else10 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB16_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB16_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB16_14: # %else12 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB16_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB16_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB16_16: # %else14 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB16_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB16_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB16_18: # %else16 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB16_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB16_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB16_20: # 
%else18 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB16_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB16_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB16_22: # %else20 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB16_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB16_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB16_24: # %else22 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB16_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB16_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB16_26: # %else24 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB16_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB16_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB16_28: # %else26 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB16_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB16_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB16_30: # %else28 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB16_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB16_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB16_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -8209,316 +7373,278 @@ ; SSE2-LABEL: truncstore_v8i16_v8i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd 
%xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB17_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB17_2: # %else -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB17_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB17_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: packsswb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB17_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB17_3 +; SSE2-NEXT: .LBB17_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB17_5 ; SSE2-NEXT: .LBB17_6: # %else4 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB17_7 ; SSE2-NEXT: .LBB17_8: # %else6 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB17_9 ; SSE2-NEXT: .LBB17_10: # %else8 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB17_11 ; SSE2-NEXT: .LBB17_12: # %else10 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB17_13 ; SSE2-NEXT: .LBB17_14: # %else12 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB17_15 +; SSE2-NEXT: .LBB17_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB17_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB17_4 +; SSE2-NEXT: .LBB17_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB17_6 +; SSE2-NEXT: .LBB17_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB17_8 +; SSE2-NEXT: .LBB17_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB17_10 +; SSE2-NEXT: .LBB17_9: # 
%cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB17_12 +; SSE2-NEXT: .LBB17_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB17_14 +; SSE2-NEXT: .LBB17_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB17_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB17_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB17_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i16_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pminsw {{.*}}(%rip), %xmm0 ; SSE4-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pcmpeqw %xmm1, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm1 +; SSE4-NEXT: packsswb %xmm0, %xmm1 +; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB17_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB17_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB17_3 +; SSE4-NEXT: .LBB17_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB17_5 +; SSE4-NEXT: .LBB17_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB17_7 +; SSE4-NEXT: .LBB17_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB17_9 +; SSE4-NEXT: .LBB17_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB17_11 +; SSE4-NEXT: .LBB17_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB17_13 +; SSE4-NEXT: .LBB17_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB17_15 +; SSE4-NEXT: .LBB17_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB17_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB17_2: # %else -; SSE4-NEXT: pextrb $2, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB17_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB17_3: # %cond.store1 ; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB17_4: # %else2 -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB17_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB17_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB17_6: # %else4 -; SSE4-NEXT: pextrb $6, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB17_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB17_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB17_8: # %else6 -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB17_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB17_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB17_10: # %else8 -; SSE4-NEXT: pextrb $10, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB17_12 -; 
SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB17_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB17_12: # %else10 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB17_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB17_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB17_14: # %else12 -; SSE4-NEXT: pextrb $14, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB17_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB17_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB17_16: # %else14 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v8i16_v8i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpmovmskb %xmm1, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB17_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB17_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB17_3 +; AVX-NEXT: .LBB17_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB17_5 +; AVX-NEXT: .LBB17_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB17_7 +; AVX-NEXT: .LBB17_8: # %else6 +; AVX-NEXT: testb $16, %al +; AVX-NEXT: jne .LBB17_9 +; AVX-NEXT: .LBB17_10: # %else8 +; AVX-NEXT: testb $32, %al +; AVX-NEXT: jne .LBB17_11 +; AVX-NEXT: .LBB17_12: # %else10 +; AVX-NEXT: testb $64, %al +; AVX-NEXT: jne .LBB17_13 +; AVX-NEXT: .LBB17_14: # %else12 +; AVX-NEXT: testb $-128, %al +; AVX-NEXT: jne .LBB17_15 +; AVX-NEXT: .LBB17_16: # %else14 +; AVX-NEXT: retq +; AVX-NEXT: .LBB17_1: # %cond.store ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB17_2: # %else -; AVX-NEXT: vpextrb $2, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB17_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB17_3: # %cond.store1 ; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB17_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $4, %al ; AVX-NEXT: je .LBB17_6 -; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: .LBB17_5: # %cond.store3 ; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB17_6: # %else4 -; AVX-NEXT: vpextrb $6, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $8, %al ; AVX-NEXT: je .LBB17_8 -; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: .LBB17_7: # %cond.store5 ; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX-NEXT: .LBB17_8: # %else6 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $16, %al ; AVX-NEXT: je .LBB17_10 -; AVX-NEXT: # %bb.9: # %cond.store7 +; 
AVX-NEXT: .LBB17_9: # %cond.store7 ; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX-NEXT: .LBB17_10: # %else8 -; AVX-NEXT: vpextrb $10, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $32, %al ; AVX-NEXT: je .LBB17_12 -; AVX-NEXT: # %bb.11: # %cond.store9 +; AVX-NEXT: .LBB17_11: # %cond.store9 ; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX-NEXT: .LBB17_12: # %else10 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $64, %al ; AVX-NEXT: je .LBB17_14 -; AVX-NEXT: # %bb.13: # %cond.store11 +; AVX-NEXT: .LBB17_13: # %cond.store11 ; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX-NEXT: .LBB17_14: # %else12 -; AVX-NEXT: vpextrb $14, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $-128, %al ; AVX-NEXT: je .LBB17_16 -; AVX-NEXT: # %bb.15: # %cond.store13 +; AVX-NEXT: .LBB17_15: # %cond.store13 ; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX-NEXT: .LBB17_16: # %else14 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v8i16_v8i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB17_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB17_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB17_3 +; AVX512F-NEXT: .LBB17_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB17_5 +; AVX512F-NEXT: .LBB17_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB17_7 +; AVX512F-NEXT: .LBB17_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB17_9 +; AVX512F-NEXT: .LBB17_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB17_11 +; AVX512F-NEXT: .LBB17_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB17_13 +; AVX512F-NEXT: .LBB17_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB17_15 +; AVX512F-NEXT: .LBB17_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB17_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB17_2: # %else -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB17_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB17_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB17_4: # %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: 
kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB17_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB17_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB17_6: # %else4 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB17_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB17_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB17_8: # %else6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB17_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB17_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB17_10: # %else8 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB17_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB17_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB17_12: # %else10 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB17_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB17_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB17_14: # %else12 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB17_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB17_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB17_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; Index: llvm/trunk/test/CodeGen/X86/masked_store_trunc_usat.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/masked_store_trunc_usat.ll +++ llvm/trunk/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -11,233 +11,220 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm12, %xmm12 -; SSE2-NEXT: pxor %xmm6, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: packssdw %xmm0, %xmm9 -; SSE2-NEXT: movdqa 
{{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm13, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm12, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm13, %xmm1 -; SSE2-NEXT: movdqa %xmm10, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pxor %xmm12, %xmm1 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm14 -; SSE2-NEXT: por %xmm0, %xmm14 -; SSE2-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm6[0,2] +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm13 +; SSE2-NEXT: por %xmm0, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm7[0,2] ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm13, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: pxor %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm11, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm2, %xmm13 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm13, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm12 +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] -; SSE2-NEXT: movd %xmm9, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: 
pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm0, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movss %xmm14, (%rdi) +; SSE2-NEXT: movss %xmm13, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: por %xmm11, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm6 -; SSE2-NEXT: psrlq $16, %xmm12 -; SSE2-NEXT: movd %xmm12, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,2,3] ; SSE2-NEXT: movd %xmm1, 4(%rdi) ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pextrw $4, %xmm7, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pandn %xmm9, %xmm6 +; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,0,1] ; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm7, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) ; SSE2-NEXT: .LBB0_8: # %else6 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB0_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB0_9 +; SSE2-NEXT: # %bb.10: # %else8 +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB0_11 +; SSE2-NEXT: .LBB0_12: # %else10 +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB0_13 +; SSE2-NEXT: .LBB0_14: # %else12 +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB0_15 +; SSE2-NEXT: .LBB0_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB0_9: # %cond.store7 ; SSE2-NEXT: movss %xmm2, 16(%rdi) -; SSE2-NEXT: .LBB0_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB0_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 +; SSE2-NEXT: .LBB0_11: # %cond.store9 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] ; SSE2-NEXT: movd %xmm0, 20(%rdi) -; SSE2-NEXT: .LBB0_12: # %else10 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB0_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSE2-NEXT: movd %xmm1, 24(%rdi) -; SSE2-NEXT: .LBB0_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; 
SSE2-NEXT: .LBB0_13: # %cond.store11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm0, 24(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB0_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB0_15: # %cond.store13 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] ; SSE2-NEXT: movd %xmm0, 28(%rdi) -; SSE2-NEXT: .LBB0_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm8 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE4-NEXT: pxor %xmm0, %xmm10 -; SSE4-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] +; SSE4-NEXT: pxor %xmm9, %xmm9 +; SSE4-NEXT: movapd {{.*#+}} xmm10 = [4294967295,4294967295] ; SSE4-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm1, %xmm6 ; SSE4-NEXT: pxor %xmm11, %xmm6 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372041149743103,9223372041149743103] ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movapd %xmm9, %xmm6 +; SSE4-NEXT: movapd %xmm10, %xmm6 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE4-NEXT: movdqa %xmm8, %xmm1 ; SSE4-NEXT: pxor %xmm11, %xmm1 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movapd %xmm9, %xmm1 +; SSE4-NEXT: movapd %xmm10, %xmm1 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] ; SSE4-NEXT: movdqa %xmm3, %xmm6 ; SSE4-NEXT: pxor %xmm11, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movapd %xmm9, %xmm8 +; SSE4-NEXT: movapd %xmm10, %xmm8 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 ; SSE4-NEXT: pxor %xmm2, %xmm11 ; SSE4-NEXT: pcmpgtq %xmm11, %xmm7 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE4-NEXT: pextrb $0, %xmm10, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_2 -; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: movss %xmm1, (%rdi) -; SSE4-NEXT: .LBB0_2: # %else -; SSE4-NEXT: pextrb $4, %xmm10, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) -; SSE4-NEXT: .LBB0_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10 +; SSE4-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm4, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi) +; SSE4-NEXT: pxor %xmm0, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm0, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB0_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB0_3 +; SSE4-NEXT: .LBB0_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB0_5 ; SSE4-NEXT: .LBB0_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB0_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB0_7: # %cond.store5 ; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi) ; SSE4-NEXT: .LBB0_8: # %else6 -; SSE4-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[0,2] -; SSE4-NEXT: xorps %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor 
%xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: movss %xmm9, 16(%rdi) -; SSE4-NEXT: .LBB0_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: extractps $1, %xmm9, 20(%rdi) +; SSE4-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2] +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB0_9 +; SSE4-NEXT: # %bb.10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB0_11 ; SSE4-NEXT: .LBB0_12: # %else10 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm5, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 -; SSE4-NEXT: extractps $2, %xmm9, 24(%rdi) +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB0_13 ; SSE4-NEXT: .LBB0_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB0_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 -; SSE4-NEXT: extractps $3, %xmm9, 28(%rdi) +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB0_15 ; SSE4-NEXT: .LBB0_16: # %else14 ; SSE4-NEXT: retq +; SSE4-NEXT: .LBB0_1: # %cond.store +; SSE4-NEXT: movss %xmm1, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB0_4 +; SSE4-NEXT: .LBB0_3: # %cond.store1 +; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: je .LBB0_6 +; SSE4-NEXT: .LBB0_5: # %cond.store3 +; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB0_7 +; SSE4-NEXT: jmp .LBB0_8 +; SSE4-NEXT: .LBB0_9: # %cond.store7 +; SSE4-NEXT: movss %xmm10, 16(%rdi) +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: je .LBB0_12 +; SSE4-NEXT: .LBB0_11: # %cond.store9 +; SSE4-NEXT: extractps $1, %xmm10, 20(%rdi) +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: je .LBB0_14 +; SSE4-NEXT: .LBB0_13: # %cond.store11 +; SSE4-NEXT: extractps $2, %xmm10, 24(%rdi) +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: je .LBB0_16 +; SSE4-NEXT: .LBB0_15: # %cond.store13 +; SSE4-NEXT: extractps $3, %xmm10, 28(%rdi) +; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i32: ; AVX1: # %bb.0: @@ -333,65 +320,60 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: packssdw %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm12, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm14 -; SSE2-NEXT: por %xmm2, %xmm14 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa 
%xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm10, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm13 +; SSE2-NEXT: por %xmm2, %xmm13 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -400,89 +382,87 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: movd %xmm9, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB1_2: # %else -; SSE2-NEXT: psrlq $16, %xmm10 
-; SSE2-NEXT: movd %xmm10, %eax -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) -; SSE2-NEXT: .LBB1_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm0, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: jne .LBB1_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB1_3 +; SSE2-NEXT: .LBB1_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB1_5 ; SSE2-NEXT: .LBB1_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB1_7 ; SSE2-NEXT: .LBB1_8: # %else6 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB1_9 ; SSE2-NEXT: .LBB1_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB1_11 ; SSE2-NEXT: .LBB1_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB1_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB1_13 ; SSE2-NEXT: .LBB1_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB1_15 +; SSE2-NEXT: .LBB1_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB1_1: # %cond.store +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB1_4 +; SSE2-NEXT: .LBB1_3: # %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB1_6 +; SSE2-NEXT: .LBB1_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB1_8 +; SSE2-NEXT: .LBB1_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB1_10 +; SSE2-NEXT: .LBB1_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB1_12 +; SSE2-NEXT: .LBB1_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; 
SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB1_14 +; SSE2-NEXT: .LBB1_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB1_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB1_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) -; SSE2-NEXT: .LBB1_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i16: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm8 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE4-NEXT: pxor %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: movapd {{.*#+}} xmm6 = [65535,65535] ; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm1, %xmm7 @@ -492,12 +472,12 @@ ; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 ; SSE4-NEXT: movapd %xmm6, %xmm7 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE4-NEXT: movdqa %xmm8, %xmm1 +; SSE4-NEXT: movdqa %xmm9, %xmm1 ; SSE4-NEXT: pxor %xmm10, %xmm1 ; SSE4-NEXT: movdqa %xmm11, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movapd %xmm6, %xmm1 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm1 ; SSE4-NEXT: packusdw %xmm7, %xmm1 ; SSE4-NEXT: movdqa %xmm3, %xmm7 ; SSE4-NEXT: pxor %xmm10, %xmm7 @@ -511,246 +491,234 @@ ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: packusdw %xmm7, %xmm6 ; SSE4-NEXT: packusdw %xmm6, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm9, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB1_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm0, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm0, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB1_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB1_3 +; SSE4-NEXT: .LBB1_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB1_5 +; SSE4-NEXT: .LBB1_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB1_7 +; SSE4-NEXT: .LBB1_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB1_9 +; SSE4-NEXT: .LBB1_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB1_11 +; SSE4-NEXT: .LBB1_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB1_13 +; SSE4-NEXT: .LBB1_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB1_15 +; SSE4-NEXT: .LBB1_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB1_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm1, (%rdi) -; SSE4-NEXT: .LBB1_2: # %else -; SSE4-NEXT: pextrb $4, %xmm9, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB1_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB1_3: # %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm1, 2(%rdi) -; SSE4-NEXT: .LBB1_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm4, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB1_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB1_5: # %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm1, 4(%rdi) -; SSE4-NEXT: .LBB1_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; 
SSE4-NEXT: je .LBB1_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB1_7: # %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm1, 6(%rdi) -; SSE4-NEXT: .LBB1_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB1_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB1_9: # %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm1, 8(%rdi) -; SSE4-NEXT: .LBB1_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB1_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB1_11: # %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm1, 10(%rdi) -; SSE4-NEXT: .LBB1_12: # %else10 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm5, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB1_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB1_13: # %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm1, 12(%rdi) -; SSE4-NEXT: .LBB1_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB1_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB1_15: # %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm1, 14(%rdi) -; SSE4-NEXT: .LBB1_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm9, %xmm5, %xmm10 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm12 -; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [65535,65535] -; AVX1-NEXT: vblendvpd %xmm3, %xmm5, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpxor %xmm3, %xmm6, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm9 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535] +; AVX1-NEXT: vblendvpd %xmm3, %xmm7, %xmm5, %xmm3 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm12, %xmm4, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm11, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm5, %xmm3 +; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw 
%xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm10, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB1_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB1_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB1_3 +; AVX1-NEXT: .LBB1_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB1_5 +; AVX1-NEXT: .LBB1_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB1_7 +; AVX1-NEXT: .LBB1_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB1_9 +; AVX1-NEXT: .LBB1_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB1_11 +; AVX1-NEXT: .LBB1_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB1_13 +; AVX1-NEXT: .LBB1_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB1_15 +; AVX1-NEXT: .LBB1_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB1_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB1_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB1_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB1_3: # %cond.store1 ; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB1_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB1_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB1_5: # %cond.store3 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB1_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB1_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB1_7: # %cond.store5 ; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB1_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB1_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB1_9: # %cond.store7 ; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB1_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB1_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB1_11: # %cond.store9 ; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB1_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB1_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB1_13: # %cond.store11 ; AVX1-NEXT: vpextrw 
$6, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB1_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB1_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB1_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB1_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i64_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm7, %ymm1, %ymm8 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] -; AVX2-NEXT: vpcmpgtq %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vblendvpd %ymm8, %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpxor %ymm7, %ymm0, %ymm7 -; AVX2-NEXT: vpcmpgtq %ymm7, %ymm9, %ymm7 -; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm5, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB1_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB1_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB1_3 +; AVX2-NEXT: .LBB1_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB1_5 +; AVX2-NEXT: .LBB1_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB1_7 +; AVX2-NEXT: .LBB1_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB1_9 +; AVX2-NEXT: .LBB1_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB1_11 +; AVX2-NEXT: .LBB1_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB1_13 +; AVX2-NEXT: .LBB1_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB1_15 +; AVX2-NEXT: .LBB1_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB1_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB1_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB1_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB1_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB1_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; 
AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB1_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB1_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB1_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB1_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB1_7: # %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB1_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB1_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB1_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB1_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB1_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB1_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB1_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB1_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB1_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB1_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB1_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB1_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB1_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -761,66 +729,61 @@ ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB1_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB1_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB1_3 +; AVX512F-NEXT: .LBB1_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB1_5 +; AVX512F-NEXT: .LBB1_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB1_7 +; AVX512F-NEXT: .LBB1_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB1_9 +; AVX512F-NEXT: .LBB1_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB1_11 +; AVX512F-NEXT: .LBB1_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB1_13 +; AVX512F-NEXT: .LBB1_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB1_15 +; AVX512F-NEXT: .LBB1_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB1_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB1_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB1_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB1_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 
2(%rdi) -; AVX512F-NEXT: .LBB1_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB1_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB1_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB1_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB1_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB1_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB1_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB1_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB1_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB1_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB1_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB1_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB1_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB1_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB1_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB1_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB1_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB1_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB1_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -851,148 +814,141 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 -; SSE2-NEXT: pxor %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: packssdw %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm11, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm10, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] +; 
SSE2-NEXT: pcmpeqd %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm1 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] +; SSE2-NEXT: pxor %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm6, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 +; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm12 +; SSE2-NEXT: pxor %xmm2, %xmm11 ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm6 -; SSE2-NEXT: movd %xmm9, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm6, %eax -; SSE2-NEXT: je .LBB2_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB2_2: # %else -; SSE2-NEXT: psrlq $16, %xmm11 -; SSE2-NEXT: movd %xmm11, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB2_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB2_4: # %else2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm0, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: 
pxor %xmm0, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm0, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm6, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movd %xmm7, %ecx +; SSE2-NEXT: jne .LBB2_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB2_3 +; SSE2-NEXT: .LBB2_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB2_5 ; SSE2-NEXT: .LBB2_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm6, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB2_7 ; SSE2-NEXT: .LBB2_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm6, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB2_9 ; SSE2-NEXT: .LBB2_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm6, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB2_11 ; SSE2-NEXT: .LBB2_12: # %else10 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB2_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm6, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB2_13 ; SSE2-NEXT: .LBB2_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB2_15 +; SSE2-NEXT: .LBB2_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB2_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB2_4 +; SSE2-NEXT: .LBB2_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB2_6 +; SSE2-NEXT: .LBB2_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm7, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB2_8 +; SSE2-NEXT: .LBB2_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm7, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB2_10 +; SSE2-NEXT: .LBB2_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm7, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB2_12 +; SSE2-NEXT: .LBB2_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm7, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB2_14 +; SSE2-NEXT: .LBB2_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm7, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB2_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm6, %eax +; SSE2-NEXT: .LBB2_15: # %cond.store13 +; SSE2-NEXT: pextrw $7, %xmm7, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB2_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm8 -; 
SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE4-NEXT: pxor %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: movapd {{.*#+}} xmm6 = [255,255] ; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm1, %xmm7 @@ -1002,12 +958,12 @@ ; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 ; SSE4-NEXT: movapd %xmm6, %xmm7 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE4-NEXT: movdqa %xmm8, %xmm1 +; SSE4-NEXT: movdqa %xmm9, %xmm1 ; SSE4-NEXT: pxor %xmm10, %xmm1 ; SSE4-NEXT: movdqa %xmm11, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movapd %xmm6, %xmm1 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm1 ; SSE4-NEXT: packusdw %xmm7, %xmm1 ; SSE4-NEXT: movdqa %xmm3, %xmm7 ; SSE4-NEXT: pxor %xmm10, %xmm7 @@ -1021,246 +977,234 @@ ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: packusdw %xmm7, %xmm6 ; SSE4-NEXT: packusdw %xmm6, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm9, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB2_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm0, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm0, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm0, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB2_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB2_3 +; SSE4-NEXT: .LBB2_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB2_5 +; SSE4-NEXT: .LBB2_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB2_7 +; SSE4-NEXT: .LBB2_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB2_9 +; SSE4-NEXT: .LBB2_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB2_11 +; SSE4-NEXT: .LBB2_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB2_13 +; SSE4-NEXT: .LBB2_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB2_15 +; SSE4-NEXT: .LBB2_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB2_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm1, (%rdi) -; SSE4-NEXT: .LBB2_2: # %else -; SSE4-NEXT: pextrb $4, %xmm9, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB2_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB2_3: # %cond.store1 ; SSE4-NEXT: pextrb $2, %xmm1, 1(%rdi) -; SSE4-NEXT: .LBB2_4: # %else2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm4, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB2_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB2_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm1, 2(%rdi) -; SSE4-NEXT: .LBB2_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB2_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB2_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm1, 3(%rdi) -; SSE4-NEXT: .LBB2_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB2_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB2_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm1, 4(%rdi) -; SSE4-NEXT: 
.LBB2_10: # %else8
-; SSE4-NEXT: pextrb $4, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $32, %al
 ; SSE4-NEXT: je .LBB2_12
-; SSE4-NEXT: # %bb.11: # %cond.store9
+; SSE4-NEXT: .LBB2_11: # %cond.store9
 ; SSE4-NEXT: pextrb $10, %xmm1, 5(%rdi)
-; SSE4-NEXT: .LBB2_12: # %else10
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm5, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $64, %al
 ; SSE4-NEXT: je .LBB2_14
-; SSE4-NEXT: # %bb.13: # %cond.store11
+; SSE4-NEXT: .LBB2_13: # %cond.store11
 ; SSE4-NEXT: pextrb $12, %xmm1, 6(%rdi)
-; SSE4-NEXT: .LBB2_14: # %else12
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $-128, %al
 ; SSE4-NEXT: je .LBB2_16
-; SSE4-NEXT: # %bb.15: # %cond.store13
+; SSE4-NEXT: .LBB2_15: # %cond.store13
 ; SSE4-NEXT: pextrb $14, %xmm1, 7(%rdi)
-; SSE4-NEXT: .LBB2_16: # %else14
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: truncstore_v8i64_v8i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpxor %xmm9, %xmm5, %xmm10
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm12
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255]
-; AVX1-NEXT: vblendvpd %xmm3, %xmm5, %xmm6, %xmm3
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpxor %xmm3, %xmm6, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm9
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [255,255]
+; AVX1-NEXT: vblendvpd %xmm3, %xmm7, %xmm5, %xmm3
+; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm12, %xmm4, %xmm6, %xmm3
-; AVX1-NEXT: vblendvpd %xmm11, %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm10, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB2_2
-; AVX1-NEXT: # %bb.1: # %cond.store
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovmskps %ymm1, %eax
+; AVX1-NEXT: notl %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB2_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB2_3
+; AVX1-NEXT: .LBB2_4: # %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: jne .LBB2_5
+; AVX1-NEXT: .LBB2_6: # %else4
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB2_7
+; AVX1-NEXT: .LBB2_8: # %else6
+; AVX1-NEXT: testb $16, %al
+; AVX1-NEXT: jne .LBB2_9
+; AVX1-NEXT: .LBB2_10: # %else8
+; AVX1-NEXT: testb $32, %al
+; AVX1-NEXT: jne .LBB2_11
+; AVX1-NEXT: .LBB2_12: # %else10
+; AVX1-NEXT: testb $64, %al
+; AVX1-NEXT: jne .LBB2_13
+; AVX1-NEXT: .LBB2_14: # %else12
+; AVX1-NEXT: testb $-128, %al
+; AVX1-NEXT: jne .LBB2_15
+; AVX1-NEXT: .LBB2_16: # %else14
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB2_1: # %cond.store
 ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX1-NEXT: .LBB2_2: # %else
-; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB2_4
-; AVX1-NEXT: # %bb.3: # %cond.store1
+; AVX1-NEXT: .LBB2_3: # %cond.store1
 ; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi)
-; AVX1-NEXT: .LBB2_4: # %else2
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpextrb $8, %xmm4, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $4, %al
 ; AVX1-NEXT: je .LBB2_6
-; AVX1-NEXT: # %bb.5: # %cond.store3
+; AVX1-NEXT: .LBB2_5: # %cond.store3
 ; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi)
-; AVX1-NEXT: .LBB2_6: # %else4
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $8, %al
 ; AVX1-NEXT: je .LBB2_8
-; AVX1-NEXT: # %bb.7: # %cond.store5
+; AVX1-NEXT: .LBB2_7: # %cond.store5
 ; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi)
-; AVX1-NEXT: .LBB2_8: # %else6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $0, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $16, %al
 ; AVX1-NEXT: je .LBB2_10
-; AVX1-NEXT: # %bb.9: # %cond.store7
+; AVX1-NEXT: .LBB2_9: # %cond.store7
 ; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi)
-; AVX1-NEXT: .LBB2_10: # %else8
-; AVX1-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $32, %al
 ; AVX1-NEXT: je .LBB2_12
-; AVX1-NEXT: # %bb.11: # %cond.store9
+; AVX1-NEXT: .LBB2_11: # %cond.store9
 ; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi)
-; AVX1-NEXT: .LBB2_12: # %else10
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $64, %al
 ; AVX1-NEXT: je .LBB2_14
-; AVX1-NEXT: # %bb.13: # %cond.store11
+; AVX1-NEXT: .LBB2_13: # %cond.store11
 ; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi)
-; AVX1-NEXT: .LBB2_14: # %else12
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $-128, %al
 ; AVX1-NEXT: je .LBB2_16
-; AVX1-NEXT: # %bb.15: # %cond.store13
+; AVX1-NEXT: .LBB2_15: # %cond.store13
 ; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi)
-; AVX1-NEXT: .LBB2_16: # %else14
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: truncstore_v8i64_v8i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm5
-; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm7, %ymm1, %ymm8
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %ymm8, %ymm9, %ymm8
-; AVX2-NEXT: vblendvpd %ymm8, %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpxor %ymm7, %ymm0, %ymm7
-; AVX2-NEXT: vpcmpgtq %ymm7, %ymm9, %ymm7
-; AVX2-NEXT: vblendvpd %ymm7, %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
+; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6
+; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm5, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je .LBB2_2
-; AVX2-NEXT: # %bb.1: # %cond.store
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vmovmskps %ymm1, %eax
+; AVX2-NEXT: notl %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB2_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB2_3
+; AVX2-NEXT: .LBB2_4: # %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne .LBB2_5
+; AVX2-NEXT: .LBB2_6: # %else4
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB2_7
+; AVX2-NEXT: .LBB2_8: # %else6
+; AVX2-NEXT: testb $16, %al
+; AVX2-NEXT: jne .LBB2_9
+; AVX2-NEXT: .LBB2_10: # %else8
+; AVX2-NEXT: testb $32, %al
+; AVX2-NEXT: jne .LBB2_11
+; AVX2-NEXT: .LBB2_12: # %else10
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: jne .LBB2_13
+; AVX2-NEXT: .LBB2_14: # %else12
+; AVX2-NEXT: testb $-128, %al
+; AVX2-NEXT: jne .LBB2_15
+; AVX2-NEXT: .LBB2_16: # %else14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB2_1: # %cond.store
 ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX2-NEXT: .LBB2_2: # %else
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB2_4
-; AVX2-NEXT: # %bb.3: # %cond.store1
+; AVX2-NEXT: .LBB2_3: # %cond.store1
 ; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi)
-; AVX2-NEXT: .LBB2_4: # %else2
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm4
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpextrb $8, %xmm4, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $4, %al
 ; AVX2-NEXT: je .LBB2_6
-; AVX2-NEXT: # %bb.5: # %cond.store3
+; AVX2-NEXT: .LBB2_5: # %cond.store3
 ; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi)
-; AVX2-NEXT: .LBB2_6: # %else4
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $8, %al
 ; AVX2-NEXT: je .LBB2_8
-; AVX2-NEXT: # %bb.7: # %cond.store5
+; AVX2-NEXT: .LBB2_7: # %cond.store5
 ; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi)
-; AVX2-NEXT: .LBB2_8: # %else6
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $0, %xmm2, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $16, %al
 ; AVX2-NEXT: je .LBB2_10
-; AVX2-NEXT: # %bb.9: # %cond.store7
+; AVX2-NEXT: .LBB2_9: # %cond.store7
 ; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi)
-; AVX2-NEXT: .LBB2_10: # %else8
-; AVX2-NEXT: vpextrb $4, %xmm2, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $32, %al
 ; AVX2-NEXT: je .LBB2_12
-; AVX2-NEXT: # %bb.11: # %cond.store9
+; AVX2-NEXT: .LBB2_11: # %cond.store9
 ; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi)
-; AVX2-NEXT: .LBB2_12: # %else10
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $64, %al
 ; AVX2-NEXT: je .LBB2_14
-; AVX2-NEXT: # %bb.13: # %cond.store11
+; AVX2-NEXT: .LBB2_13: # %cond.store11
 ; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi)
-; AVX2-NEXT: .LBB2_14: # %else12
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $-128, %al
 ; AVX2-NEXT: je .LBB2_16
-; AVX2-NEXT: # %bb.15: # %cond.store13
+; AVX2-NEXT: .LBB2_15: # %cond.store13
 ; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi)
-; AVX2-NEXT: .LBB2_16: # %else14
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1272,66 +1216,61 @@
 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB2_2
-; AVX512F-NEXT: # %bb.1: # %cond.store
+; AVX512F-NEXT: jne .LBB2_1
+; AVX512F-NEXT: # %bb.2: # %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne .LBB2_3
+; AVX512F-NEXT: .LBB2_4: # %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: jne .LBB2_5
+; AVX512F-NEXT: .LBB2_6: # %else4
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne .LBB2_7
+; AVX512F-NEXT: .LBB2_8: # %else6
+; AVX512F-NEXT: testb $16, %al
+; AVX512F-NEXT: jne .LBB2_9
+; AVX512F-NEXT: .LBB2_10: # %else8
+; AVX512F-NEXT: testb $32, %al
+; AVX512F-NEXT: jne .LBB2_11
+; AVX512F-NEXT: .LBB2_12: # %else10
+; AVX512F-NEXT: testb $64, %al
+; AVX512F-NEXT: jne .LBB2_13
+; AVX512F-NEXT: .LBB2_14: # %else12
+; AVX512F-NEXT: testb $-128, %al
+; AVX512F-NEXT: jne .LBB2_15
+; AVX512F-NEXT: .LBB2_16: # %else14
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: .LBB2_1: # %cond.store
 ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX512F-NEXT: .LBB2_2: # %else
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
 ; AVX512F-NEXT: je .LBB2_4
-; AVX512F-NEXT: # %bb.3: # %cond.store1
+; AVX512F-NEXT: .LBB2_3: # %cond.store1
 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
-; AVX512F-NEXT: .LBB2_4: # %else2
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $4, %al
 ; AVX512F-NEXT: je .LBB2_6
-; AVX512F-NEXT: # %bb.5: # %cond.store3
+; AVX512F-NEXT: .LBB2_5: # %cond.store3
 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
-; AVX512F-NEXT: .LBB2_6: # %else4
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
 ; AVX512F-NEXT: je .LBB2_8
-; AVX512F-NEXT: # %bb.7: # %cond.store5
+; AVX512F-NEXT: .LBB2_7: # %cond.store5
 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
-; AVX512F-NEXT: .LBB2_8: # %else6
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $16, %al
 ; AVX512F-NEXT: je .LBB2_10
-; AVX512F-NEXT: # %bb.9: # %cond.store7
+; AVX512F-NEXT: .LBB2_9: # %cond.store7
 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
-; AVX512F-NEXT: .LBB2_10: # %else8
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $5, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $32, %al
 ; AVX512F-NEXT: je .LBB2_12
-; AVX512F-NEXT: # %bb.11: # %cond.store9
+; AVX512F-NEXT: .LBB2_11: # %cond.store9
 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
-; AVX512F-NEXT: .LBB2_12: # %else10
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $6, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $64, %al
 ; AVX512F-NEXT: je .LBB2_14
-; AVX512F-NEXT: # %bb.13: # %cond.store11
+; AVX512F-NEXT: .LBB2_13: # %cond.store11
 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
-; AVX512F-NEXT: .LBB2_14: # %else12
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $7, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $-128, %al
 ; AVX512F-NEXT: je .LBB2_16
-; AVX512F-NEXT: # %bb.15: # %cond.store13
+; AVX512F-NEXT: .LBB2_15: # %cond.store13
 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
-; AVX512F-NEXT: .LBB2_16: # %else14
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -1363,7 +1302,6 @@
 ; SSE2-LABEL: truncstore_v4i64_v4i32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT: movdqa %xmm1, %xmm6
@@ -1393,50 +1331,45 @@
 ; SSE2-NEXT: pandn %xmm8, %xmm1
 ; SSE2-NEXT: por %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: notl %eax
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: movmskps %xmm3, %eax
+; SSE2-NEXT: xorl $15, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB3_2
-; SSE2-NEXT: # %bb.1: # %cond.store
+; SSE2-NEXT: jne .LBB3_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB3_3
+; SSE2-NEXT: .LBB3_4: # %else2
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne .LBB3_5
+; SSE2-NEXT: .LBB3_6: # %else4
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne .LBB3_7
+; SSE2-NEXT: .LBB3_8: # %else6
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB3_1: # %cond.store
 ; SSE2-NEXT: movss %xmm1, (%rdi)
-; SSE2-NEXT: .LBB3_2: # %else
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pextrw $2, %xmm3, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je .LBB3_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, 4(%rdi)
-; SSE2-NEXT: .LBB3_4: # %else2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: .LBB3_3: # %cond.store1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movd %xmm0, 4(%rdi)
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB3_6
-; SSE2-NEXT: # %bb.5: # %cond.store3
+; SSE2-NEXT: .LBB3_5: # %cond.store3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
-; SSE2-NEXT: .LBB3_6: # %else4
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je .LBB3_8
-; SSE2-NEXT: # %bb.7: # %cond.store5
+; SSE2-NEXT: .LBB3_7: # %cond.store5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE2-NEXT: movd %xmm0, 12(%rdi)
-; SSE2-NEXT: .LBB3_8: # %else6
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: truncstore_v4i64_v4i32:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm8
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE4-NEXT: pxor %xmm0, %xmm6
+; SSE4-NEXT: pxor %xmm6, %xmm6
 ; SSE4-NEXT: movapd {{.*#+}} xmm5 = [4294967295,4294967295]
 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
 ; SSE4-NEXT: movdqa %xmm1, %xmm3
@@ -1451,34 +1384,36 @@
 ; SSE4-NEXT: movdqa %xmm4, %xmm0
 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5
 ; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
-; SSE4-NEXT: pextrb $0, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB3_2
-; SSE4-NEXT: # %bb.1: # %cond.store
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE4-NEXT: movmskps %xmm6, %eax
+; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB3_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB3_3
+; SSE4-NEXT: .LBB3_4: # %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne .LBB3_5
+; SSE4-NEXT: .LBB3_6: # %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne .LBB3_7
+; SSE4-NEXT: .LBB3_8: # %else6
+; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB3_1: # %cond.store
 ; SSE4-NEXT: movss %xmm5, (%rdi)
-; SSE4-NEXT: .LBB3_2: # %else
-; SSE4-NEXT: pextrb $4, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je .LBB3_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
+; SSE4-NEXT: .LBB3_3: # %cond.store1
 ; SSE4-NEXT: extractps $1, %xmm5, 4(%rdi)
-; SSE4-NEXT: .LBB3_4: # %else2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
 ; SSE4-NEXT: je .LBB3_6
-; SSE4-NEXT: # %bb.5: # %cond.store3
+; SSE4-NEXT: .LBB3_5: # %cond.store3
 ; SSE4-NEXT: extractps $2, %xmm5, 8(%rdi)
-; SSE4-NEXT: .LBB3_6: # %else4
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
 ; SSE4-NEXT: je .LBB3_8
-; SSE4-NEXT: # %bb.7: # %cond.store5
+; SSE4-NEXT: .LBB3_7: # %cond.store5
 ; SSE4-NEXT: extractps $3, %xmm5, 12(%rdi)
-; SSE4-NEXT: .LBB3_8: # %else6
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: truncstore_v4i64_v4i32:
@@ -1567,7 +1502,6 @@
 ; SSE2-LABEL: truncstore_v4i64_v4i16:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT: movdqa %xmm1, %xmm6
@@ -1597,51 +1531,46 @@
 ; SSE2-NEXT: pandn %xmm8, %xmm1
 ; SSE2-NEXT: por %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB4_2
-; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: .LBB4_2: # %else
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pextrw $2, %xmm3, %eax
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: movmskps %xmm3, %eax
+; SSE2-NEXT: xorl $15, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB4_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: movw %ax, 2(%rdi)
+; SSE2-NEXT: jne .LBB4_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB4_3
 ; SSE2-NEXT: .LBB4_4: # %else2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB4_6
-; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: movw %ax, 4(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne .LBB4_5
 ; SSE2-NEXT: .LBB4_6: # %else4
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne .LBB4_7
+; SSE2-NEXT: .LBB4_8: # %else6
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB4_1: # %cond.store
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: movw %cx, (%rdi)
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: je .LBB4_4
+; SSE2-NEXT: .LBB4_3: # %cond.store1
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
+; SSE2-NEXT: movw %cx, 2(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: je .LBB4_6
+; SSE2-NEXT: .LBB4_5: # %cond.store3
+; SSE2-NEXT: pextrw $4, %xmm1, %ecx
+; SSE2-NEXT: movw %cx, 4(%rdi)
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je .LBB4_8
-; SSE2-NEXT: # %bb.7: # %cond.store5
+; SSE2-NEXT: .LBB4_7: # %cond.store5
 ; SSE2-NEXT: pextrw $6, %xmm1, %eax
 ; SSE2-NEXT: movw %ax, 6(%rdi)
-; SSE2-NEXT: .LBB4_8: # %else6
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: truncstore_v4i64_v4i16:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm8
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE4-NEXT: pxor %xmm0, %xmm6
+; SSE4-NEXT: pxor %xmm6, %xmm6
 ; SSE4-NEXT: movapd {{.*#+}} xmm5 = [65535,65535]
 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
 ; SSE4-NEXT: movdqa %xmm1, %xmm3
@@ -1656,42 +1585,41 @@
 ; SSE4-NEXT: movdqa %xmm4, %xmm0
 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5
 ; SSE4-NEXT: packusdw %xmm3, %xmm5
-; SSE4-NEXT: pextrb $0, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB4_2
-; SSE4-NEXT: # %bb.1: # %cond.store
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE4-NEXT: movmskps %xmm6, %eax
+; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB4_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB4_3
+; SSE4-NEXT: .LBB4_4: # %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne .LBB4_5
+; SSE4-NEXT: .LBB4_6: # %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne .LBB4_7
+; SSE4-NEXT: .LBB4_8: # %else6
+; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB4_1: # %cond.store
 ; SSE4-NEXT: pextrw $0, %xmm5, (%rdi)
-; SSE4-NEXT: .LBB4_2: # %else
-; SSE4-NEXT: pextrb $4, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je .LBB4_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
+; SSE4-NEXT: .LBB4_3: # %cond.store1
 ; SSE4-NEXT: pextrw $2, %xmm5, 2(%rdi)
-; SSE4-NEXT: .LBB4_4: # %else2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
 ; SSE4-NEXT: je .LBB4_6
-; SSE4-NEXT: # %bb.5: # %cond.store3
+; SSE4-NEXT: .LBB4_5: # %cond.store3
 ; SSE4-NEXT: pextrw $4, %xmm5, 4(%rdi)
-; SSE4-NEXT: .LBB4_6: # %else4
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
 ; SSE4-NEXT: je .LBB4_8
-; SSE4-NEXT: # %bb.7: # %cond.store5
+; SSE4-NEXT: .LBB4_7: # %cond.store5
 ; SSE4-NEXT: pextrw $6, %xmm5, 6(%rdi)
-; SSE4-NEXT: .LBB4_8: # %else6
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: truncstore_v4i64_v4i16:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
@@ -1703,43 +1631,43 @@
 ; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm5, %xmm3
 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: xorl $15, %eax
 ; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB4_2
-; AVX1-NEXT: # %bb.1: # %cond.store
+; AVX1-NEXT: jne .LBB4_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB4_3
+; AVX1-NEXT: .LBB4_4: # %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: jne .LBB4_5
+; AVX1-NEXT: .LBB4_6: # %else4
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB4_7
+; AVX1-NEXT: .LBB4_8: # %else6
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB4_1: # %cond.store
 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX1-NEXT: .LBB4_2: # %else
-; AVX1-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB4_4
-; AVX1-NEXT: # %bb.3: # %cond.store1
+; AVX1-NEXT: .LBB4_3: # %cond.store1
 ; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi)
-; AVX1-NEXT: .LBB4_4: # %else2
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $4, %al
 ; AVX1-NEXT: je .LBB4_6
-; AVX1-NEXT: # %bb.5: # %cond.store3
+; AVX1-NEXT: .LBB4_5: # %cond.store3
 ; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi)
-; AVX1-NEXT: .LBB4_6: # %else4
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $8, %al
 ; AVX1-NEXT: je .LBB4_8
-; AVX1-NEXT: # %bb.7: # %cond.store5
+; AVX1-NEXT: .LBB4_7: # %cond.store5
 ; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi)
-; AVX1-NEXT: .LBB4_8: # %else6
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: truncstore_v4i64_v4i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535]
 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
@@ -1748,34 +1676,37 @@
 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: xorl $15, %eax
 ; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je .LBB4_2
-; AVX2-NEXT: # %bb.1: # %cond.store
+; AVX2-NEXT: jne .LBB4_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB4_3
+; AVX2-NEXT: .LBB4_4: # %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne .LBB4_5
+; AVX2-NEXT: .LBB4_6: # %else4
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB4_7
+; AVX2-NEXT: .LBB4_8: # %else6
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB4_1: # %cond.store
 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT: .LBB4_2: # %else
-; AVX2-NEXT: vpextrb $4, %xmm2, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB4_4
-; AVX2-NEXT: # %bb.3: # %cond.store1
+; AVX2-NEXT: .LBB4_3: # %cond.store1
 ; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi)
-; AVX2-NEXT: .LBB4_4: # %else2
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $4, %al
 ; AVX2-NEXT: je .LBB4_6
-; AVX2-NEXT: # %bb.5: # %cond.store3
+; AVX2-NEXT: .LBB4_5: # %cond.store3
 ; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi)
-; AVX2-NEXT: .LBB4_6: # %else4
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $8, %al
 ; AVX2-NEXT: je .LBB4_8
-; AVX2-NEXT: # %bb.7: # %cond.store5
+; AVX2-NEXT: .LBB4_7: # %cond.store5
 ; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi)
-; AVX2-NEXT: .LBB4_8: # %else6
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1784,39 +1715,38 @@
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535]
+; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB4_2
-; AVX512F-NEXT: # %bb.1: # %cond.store
+; AVX512F-NEXT: jne .LBB4_1
+; AVX512F-NEXT: # %bb.2: # %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne .LBB4_3
+; AVX512F-NEXT: .LBB4_4: # %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: jne .LBB4_5
+; AVX512F-NEXT: .LBB4_6: # %else4
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne .LBB4_7
+; AVX512F-NEXT: .LBB4_8: # %else6
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: .LBB4_1: # %cond.store
 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX512F-NEXT: .LBB4_2: # %else
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
 ; AVX512F-NEXT: je .LBB4_4
-; AVX512F-NEXT: # %bb.3: # %cond.store1
+; AVX512F-NEXT: .LBB4_3: # %cond.store1
 ; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi)
-; AVX512F-NEXT: .LBB4_4: # %else2
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $4, %al
 ; AVX512F-NEXT: je .LBB4_6
-; AVX512F-NEXT: # %bb.5: # %cond.store3
+; AVX512F-NEXT: .LBB4_5: # %cond.store3
 ; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi)
-; AVX512F-NEXT: .LBB4_6: # %else4
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
 ; AVX512F-NEXT: je .LBB4_8
-; AVX512F-NEXT: # %bb.7: # %cond.store5
+; AVX512F-NEXT: .LBB4_7: # %cond.store5
 ; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi)
-; AVX512F-NEXT: .LBB4_8: # %else6
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -1854,7 +1784,6 @@
 ; SSE2-LABEL: truncstore_v4i64_v4i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT: movdqa %xmm1, %xmm6
@@ -1884,51 +1813,46 @@
 ; SSE2-NEXT: pandn %xmm8, %xmm1
 ; SSE2-NEXT: por %xmm0, %xmm1
 ; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB5_2
-; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: movb %al, (%rdi)
-; SSE2-NEXT: .LBB5_2: # %else
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pextrw $2, %xmm3, %eax
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: movmskps %xmm3, %eax
+; SSE2-NEXT: xorl $15, %eax
 ; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB5_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: movb %al, 1(%rdi)
+; SSE2-NEXT: jne .LBB5_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB5_3
 ; SSE2-NEXT: .LBB5_4: # %else2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB5_6
-; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: jne .LBB5_5
 ; SSE2-NEXT: .LBB5_6: # %else4
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: jne .LBB5_7
+; SSE2-NEXT: .LBB5_8: # %else6
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB5_1: # %cond.store
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: movb %cl, (%rdi)
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: je .LBB5_4
+; SSE2-NEXT: .LBB5_3: # %cond.store1
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
+; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: je .LBB5_6
+; SSE2-NEXT: .LBB5_5: # %cond.store3
+; SSE2-NEXT: pextrw $4, %xmm1, %ecx
+; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je .LBB5_8
-; SSE2-NEXT: # %bb.7: # %cond.store5
+; SSE2-NEXT: .LBB5_7: # %cond.store5
 ; SSE2-NEXT: pextrw $6, %xmm1, %eax
 ; SSE2-NEXT: movb %al, 3(%rdi)
-; SSE2-NEXT: .LBB5_8: # %else6
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: truncstore_v4i64_v4i8:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm8
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE4-NEXT: pxor %xmm0, %xmm6
+; SSE4-NEXT: pxor %xmm6, %xmm6
 ; SSE4-NEXT: movapd {{.*#+}} xmm5 = [255,255]
 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
 ; SSE4-NEXT: movdqa %xmm1, %xmm3
@@ -1943,42 +1867,41 @@
 ; SSE4-NEXT: movdqa %xmm4, %xmm0
 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5
 ; SSE4-NEXT: packusdw %xmm3, %xmm5
-; SSE4-NEXT: pextrb $0, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB5_2
-; SSE4-NEXT: # %bb.1: # %cond.store
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE4-NEXT: movmskps %xmm6, %eax
+; SSE4-NEXT: xorl $15, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB5_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB5_3
+; SSE4-NEXT: .LBB5_4: # %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne .LBB5_5
+; SSE4-NEXT: .LBB5_6: # %else4
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne .LBB5_7
+; SSE4-NEXT: .LBB5_8: # %else6
+; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB5_1: # %cond.store
 ; SSE4-NEXT: pextrb $0, %xmm5, (%rdi)
-; SSE4-NEXT: .LBB5_2: # %else
-; SSE4-NEXT: pextrb $4, %xmm6, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je .LBB5_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
+; SSE4-NEXT: .LBB5_3: # %cond.store1
 ; SSE4-NEXT: pextrb $4, %xmm5, 1(%rdi)
-; SSE4-NEXT: .LBB5_4: # %else2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: pextrb $8, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $4, %al
 ; SSE4-NEXT: je .LBB5_6
-; SSE4-NEXT: # %bb.5: # %cond.store3
+; SSE4-NEXT: .LBB5_5: # %cond.store3
 ; SSE4-NEXT: pextrb $8, %xmm5, 2(%rdi)
-; SSE4-NEXT: .LBB5_6: # %else4
-; SSE4-NEXT: pextrb $12, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $8, %al
 ; SSE4-NEXT: je .LBB5_8
-; SSE4-NEXT: # %bb.7: # %cond.store5
+; SSE4-NEXT: .LBB5_7: # %cond.store5
 ; SSE4-NEXT: pextrb $12, %xmm5, 3(%rdi)
-; SSE4-NEXT: .LBB5_8: # %else6
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: truncstore_v4i64_v4i8:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
@@ -1990,43 +1913,43 @@
 ; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm5, %xmm3
 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: xorl $15, %eax
 ; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: je .LBB5_2
-; AVX1-NEXT: # %bb.1: # %cond.store
+; AVX1-NEXT: jne .LBB5_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB5_3
+; AVX1-NEXT: .LBB5_4: # %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: jne .LBB5_5
+; AVX1-NEXT: .LBB5_6: # %else4
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB5_7
+; AVX1-NEXT: .LBB5_8: # %else6
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB5_1: # %cond.store
 ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX1-NEXT: .LBB5_2: # %else
-; AVX1-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB5_4
-; AVX1-NEXT: # %bb.3: # %cond.store1
+; AVX1-NEXT: .LBB5_3: # %cond.store1
 ; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi)
-; AVX1-NEXT: .LBB5_4: # %else2
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $4, %al
 ; AVX1-NEXT: je .LBB5_6
-; AVX1-NEXT: # %bb.5: # %cond.store3
+; AVX1-NEXT: .LBB5_5: # %cond.store3
 ; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi)
-; AVX1-NEXT: .LBB5_6: # %else4
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: testb $8, %al
 ; AVX1-NEXT: je .LBB5_8
-; AVX1-NEXT: # %bb.7: # %cond.store5
+; AVX1-NEXT: .LBB5_7: # %cond.store5
 ; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi)
-; AVX1-NEXT: .LBB5_8: # %else6
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: truncstore_v4i64_v4i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255]
 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
@@ -2035,34 +1958,37 @@
 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: xorl $15, %eax
 ; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je .LBB5_2
-; AVX2-NEXT: # %bb.1: # %cond.store
+; AVX2-NEXT: jne .LBB5_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB5_3
+; AVX2-NEXT: .LBB5_4: # %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne .LBB5_5
+; AVX2-NEXT: .LBB5_6: # %else4
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB5_7
+; AVX2-NEXT: .LBB5_8: # %else6
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB5_1: # %cond.store
 ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX2-NEXT: .LBB5_2: # %else
-; AVX2-NEXT: vpextrb $4, %xmm2, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB5_4
-; AVX2-NEXT: # %bb.3: # %cond.store1
+; AVX2-NEXT: .LBB5_3: # %cond.store1
 ; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi)
-; AVX2-NEXT: .LBB5_4: # %else2
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $4, %al
 ; AVX2-NEXT: je .LBB5_6
-; AVX2-NEXT: # %bb.5: # %cond.store3
+; AVX2-NEXT: .LBB5_5: # %cond.store3
 ; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi)
-; AVX2-NEXT: .LBB5_6: # %else4
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: testb $8, %al
 ; AVX2-NEXT: je .LBB5_8
-; AVX2-NEXT: # %bb.7: # %cond.store5
+; AVX2-NEXT: .LBB5_7: # %cond.store5
 ; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi)
-; AVX2-NEXT: .LBB5_8: # %else6
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2071,39 +1997,38 @@
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
-; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255]
+; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB5_2
-; AVX512F-NEXT: # %bb.1: # %cond.store
+; AVX512F-NEXT: jne .LBB5_1
+; AVX512F-NEXT: # %bb.2: # %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne .LBB5_3
+; AVX512F-NEXT: .LBB5_4: # %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: jne .LBB5_5
+; AVX512F-NEXT: .LBB5_6: # %else4
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne .LBB5_7
+; AVX512F-NEXT: .LBB5_8: # %else6
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: .LBB5_1: # %cond.store
 ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX512F-NEXT: .LBB5_2: # %else
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
 ; AVX512F-NEXT: je .LBB5_4
-; AVX512F-NEXT: # %bb.3: # %cond.store1
+; AVX512F-NEXT: .LBB5_3: # %cond.store1
 ; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi)
-; AVX512F-NEXT: .LBB5_4: # %else2
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $4, %al
 ; AVX512F-NEXT: je .LBB5_6
-; AVX512F-NEXT: # %bb.5: # %cond.store3
+; AVX512F-NEXT: .LBB5_5: # %cond.store3
 ; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi)
-; AVX512F-NEXT: .LBB5_6: # %else4
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $3, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $8, %al
 ; AVX512F-NEXT: je .LBB5_8
-; AVX512F-NEXT: # %bb.7: # %cond.store5
+; AVX512F-NEXT: .LBB5_7: # %cond.store5
 ; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi)
-; AVX512F-NEXT: .LBB5_8: # %else6
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -2140,68 +2065,69 @@
 define void @truncstore_v2i64_v2i32(<2 x i64> %x, <2 x i32>* %p, <2 x i64> %mask) {
 ; SSE2-LABEL: truncstore_v2i64_v2i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm2
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
 ; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB6_2
-; SSE2-NEXT: # %bb.1: # %cond.store
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: jne .LBB6_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB6_3
+; SSE2-NEXT: .LBB6_4: # %else2
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB6_1: # %cond.store
 ; SSE2-NEXT: movd %xmm2, (%rdi)
-; SSE2-NEXT: .LBB6_2: # %else
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je .LBB6_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
+; SSE2-NEXT: .LBB6_3: # %cond.store1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT: movd %xmm0, 4(%rdi)
-; SSE2-NEXT: .LBB6_4: # %else2
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: truncstore_v2i64_v2i32:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE4-NEXT: pxor %xmm0, %xmm3
-; SSE4-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm2, %xmm4
+; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: pxor %xmm0, %xmm5
 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372041149743103,9223372041149743103]
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm3, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB6_2
-; SSE4-NEXT: # %bb.1: # %cond.store
-; SSE4-NEXT: movss %xmm1, (%rdi)
-; SSE4-NEXT: .LBB6_2: # %else
-; SSE4-NEXT: pextrb $8, %xmm3, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB6_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
-; SSE4-NEXT: extractps $2, %xmm1, 4(%rdi)
+; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
+; SSE4-NEXT: movmskpd %xmm4, %eax
+; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB6_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB6_3
 ; SSE4-NEXT: .LBB6_4: # %else2
 ; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB6_1: # %cond.store
+; SSE4-NEXT: movss %xmm3, (%rdi)
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: je .LBB6_4
+; SSE4-NEXT: .LBB6_3: # %cond.store1
+; SSE4-NEXT: extractps $2, %xmm3, 4(%rdi)
+; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: truncstore_v2i64_v2i32:
 ; AVX1: # %bb.0:
@@ -2280,93 +2206,95 @@
 define void @truncstore_v2i64_v2i16(<2 x i64> %x, <2 x i16>* %p, <2 x i64> %mask) {
 ; SSE2-LABEL: truncstore_v2i64_v2i16:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm2
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
 ; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB7_2
-; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: .LBB7_2: # %else
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: jne .LBB7_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB7_3
+; SSE2-NEXT: .LBB7_4: # %else2
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB7_1: # %cond.store
+; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: movw %cx, (%rdi)
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je .LBB7_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
+; SSE2-NEXT: .LBB7_3: # %cond.store1
 ; SSE2-NEXT: pextrw $4, %xmm2, %eax
 ; SSE2-NEXT: movw %ax, 2(%rdi)
-; SSE2-NEXT: .LBB7_4: # %else2
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: truncstore_v2i64_v2i16:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE4-NEXT: pxor %xmm0, %xmm3
-; SSE4-NEXT: movapd {{.*#+}} xmm1 = [65535,65535]
-; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm2, %xmm4
+; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: movapd {{.*#+}} xmm3 = [65535,65535]
+; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: pxor %xmm0, %xmm5
 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854841343,9223372036854841343]
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm3, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB7_2
-; SSE4-NEXT: # %bb.1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm1, (%rdi)
-; SSE4-NEXT: .LBB7_2: # %else
-; SSE4-NEXT: pextrb $8, %xmm3, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB7_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
-; SSE4-NEXT: pextrw $4, %xmm1, 2(%rdi)
+; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
+; SSE4-NEXT: movmskpd %xmm4, %eax
+; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB7_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB7_3
 ; SSE4-NEXT: .LBB7_4: # %else2
 ; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB7_1: # %cond.store
+; SSE4-NEXT: pextrw $0, %xmm3, (%rdi)
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: je .LBB7_4
+; SSE4-NEXT: .LBB7_3: # %cond.store1
+; SSE4-NEXT: pextrw $4, %xmm3, 2(%rdi)
+; SSE4-NEXT: retq
 ;
 ; AVX-LABEL: truncstore_v2i64_v2i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535]
+; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm4
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
+; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovapd {{.*#+}} xmm2 = [65535,65535]
-; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm1, %eax
+; AVX-NEXT: vmovmskpd %xmm1, %eax
+; AVX-NEXT: xorl $3, %eax
 ; AVX-NEXT: testb $1, %al
-; AVX-NEXT: je .LBB7_2
-; AVX-NEXT: # %bb.1: # %cond.store
+; AVX-NEXT: jne .LBB7_1
+; AVX-NEXT: # %bb.2: # %else
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: jne .LBB7_3
+; AVX-NEXT: .LBB7_4: # %else2
+; AVX-NEXT: retq
+; AVX-NEXT: .LBB7_1: # %cond.store
 ; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT: .LBB7_2: # %else
-; AVX-NEXT: vpextrb $8, %xmm1, %eax
-; AVX-NEXT: testb $1, %al
+; AVX-NEXT: testb $2, %al
 ; AVX-NEXT: je .LBB7_4
-; AVX-NEXT: # %bb.3: # %cond.store1
+; AVX-NEXT: .LBB7_3: # %cond.store1
 ; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi)
-; AVX-NEXT: .LBB7_4: # %else2
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: truncstore_v2i64_v2i16:
@@ -2374,22 +2302,23 @@
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535]
-; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB7_2
-; AVX512F-NEXT: # %bb.1: # %cond.store
+; AVX512F-NEXT: jne .LBB7_1
+; AVX512F-NEXT: # %bb.2: # %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne .LBB7_3
+; AVX512F-NEXT: .LBB7_4: # %else2
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: .LBB7_1: # %cond.store
 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX512F-NEXT: .LBB7_2: # %else
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
 ; AVX512F-NEXT: je .LBB7_4
-; AVX512F-NEXT: # %bb.3: # %cond.store1
+; AVX512F-NEXT: .LBB7_3: # %cond.store1
 ; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi)
-; AVX512F-NEXT: .LBB7_4: # %else2
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -2425,93 +2354,95 @@
 define void @truncstore_v2i64_v2i8(<2 x i64> %x, <2 x i8>* %p, <2 x i64> %mask) {
 ; SSE2-LABEL: truncstore_v2i64_v2i8:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm2
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
 ; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB8_2
-; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: movb %al, (%rdi)
-; SSE2-NEXT: .LBB8_2: # %else
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: xorl $3, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: jne .LBB8_1
+; SSE2-NEXT: # %bb.2: # %else
+; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: jne .LBB8_3
+; SSE2-NEXT: .LBB8_4: # %else2
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB8_1: # %cond.store
+; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: movb %cl, (%rdi)
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je .LBB8_4
-; SSE2-NEXT: # %bb.3: # %cond.store1
+; SSE2-NEXT: .LBB8_3: # %cond.store1
 ; SSE2-NEXT: pextrw $4, %xmm2, %eax
 ; SSE2-NEXT: movb %al, 1(%rdi)
-; SSE2-NEXT: .LBB8_4: # %else2
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: truncstore_v2i64_v2i8:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm0, %xmm0
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE4-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE4-NEXT: pxor %xmm0, %xmm3
-; SSE4-NEXT: movapd {{.*#+}} xmm1 = [255,255]
-; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm2, %xmm4
+; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: movapd {{.*#+}} xmm3 = [255,255]
+; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: pxor %xmm0, %xmm5
 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854776063,9223372036854776063]
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm3, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB8_2
-; SSE4-NEXT: # %bb.1: # %cond.store
-; SSE4-NEXT: pextrb $0, %xmm1, (%rdi)
-; SSE4-NEXT: .LBB8_2: # %else
-; SSE4-NEXT: pextrb $8, %xmm3, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB8_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
-; SSE4-NEXT: pextrb $8, %xmm1, 1(%rdi)
+; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
+; SSE4-NEXT: movmskpd %xmm4, %eax
+; SSE4-NEXT: xorl $3, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB8_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB8_3
 ; SSE4-NEXT: .LBB8_4: # %else2
 ; SSE4-NEXT: retq
+; SSE4-NEXT: .LBB8_1: # %cond.store
+; SSE4-NEXT: pextrb $0, %xmm3, (%rdi)
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: je .LBB8_4
+; SSE4-NEXT: .LBB8_3: # %cond.store1
+; SSE4-NEXT: pextrb $8, %xmm3, 1(%rdi)
+; SSE4-NEXT: retq
 ;
 ; AVX-LABEL: truncstore_v2i64_v2i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [255,255]
+; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm4
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovapd {{.*#+}} xmm2 = [255,255]
-; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063]
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm1, %eax
+; AVX-NEXT: vmovmskpd %xmm1, %eax
+; AVX-NEXT: xorl $3, %eax
 ; AVX-NEXT: testb $1, %al
-; AVX-NEXT: je .LBB8_2
-; AVX-NEXT: # %bb.1: # %cond.store
+; AVX-NEXT: jne .LBB8_1
+; AVX-NEXT: # %bb.2: # %else
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: jne .LBB8_3
+; AVX-NEXT: .LBB8_4: # %else2
+; AVX-NEXT: retq
+; AVX-NEXT: .LBB8_1: # %cond.store
 ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX-NEXT: .LBB8_2: # %else
-; AVX-NEXT: vpextrb $8, %xmm1, %eax
-; AVX-NEXT: testb $1, %al
+; AVX-NEXT: testb $2, %al
 ; AVX-NEXT: je .LBB8_4
-; AVX-NEXT: # %bb.3: # %cond.store1
+; AVX-NEXT: .LBB8_3: # %cond.store1
 ; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi)
-; AVX-NEXT: .LBB8_4: # %else2
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: truncstore_v2i64_v2i8:
@@ -2519,22 +2450,23 @@
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255]
-; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
+; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB8_2
-; AVX512F-NEXT: # %bb.1: # %cond.store
+; AVX512F-NEXT: jne .LBB8_1
+; AVX512F-NEXT: # %bb.2: # %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: jne .LBB8_3
+; AVX512F-NEXT: .LBB8_4: # %else2
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: .LBB8_1: # %cond.store
 ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX512F-NEXT: .LBB8_2: # %else
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: testb $2, %al
 ; AVX512F-NEXT: je .LBB8_4
-; AVX512F-NEXT: # %bb.3: # %cond.store1
+; AVX512F-NEXT: .LBB8_3: # %cond.store1
 ; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi)
-; AVX512F-NEXT: .LBB8_4: # %else2
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -2570,14 +2502,13 @@
 ; SSE2-LABEL: truncstore_v16i32_v16i16:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm12
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: pxor %xmm11, %xmm13
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm11, %xmm9
 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT: movdqa %xmm10, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm13, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
 ; SSE2-NEXT: pand %xmm8, %xmm0
 ; SSE2-NEXT: pxor %xmm9, %xmm8
 ; SSE2-NEXT: por %xmm0, %xmm8
@@ -2593,676 +2524,554 @@
 ; SSE2-NEXT: pslld $16, %xmm8
 ; SSE2-NEXT: psrad $16, %xmm8
 ; SSE2-NEXT: packssdw %xmm0, %xmm8
-; SSE2-NEXT: movd %xmm12, %eax
-; SSE2-NEXT: notl %eax
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm7
+; SSE2-NEXT: pxor %xmm9, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm6
+; SSE2-NEXT: pxor %xmm9, %xmm6
+; SSE2-NEXT: packssdw %xmm7, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: packsswb %xmm6, %xmm4
+; SSE2-NEXT: pmovmskb %xmm4, %eax
 ; SSE2-NEXT: testb $1, %al
 ; SSE2-NEXT: je .LBB9_2
 ; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movd %xmm8, %eax
-; SSE2-NEXT: movw %ax, (%rdi)
+; SSE2-NEXT: movd %xmm8, %ecx
+; SSE2-NEXT: movw %cx, (%rdi)
 ; SSE2-NEXT: .LBB9_2: # %else
-; SSE2-NEXT: pxor %xmm9, %xmm12
-; SSE2-NEXT: pextrw $2, %xmm12, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je .LBB9_4
 ; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pextrw $1, %xmm8, %eax
-; SSE2-NEXT: movw %ax, 2(%rdi)
+; SSE2-NEXT: pextrw $1, %xmm8, %ecx
+; SSE2-NEXT: movw %cx, 2(%rdi)
 ; SSE2-NEXT: .LBB9_4: # %else2
-; SSE2-NEXT: movdqa %xmm2, %xmm12
-; SSE2-NEXT: pxor %xmm11, %xmm12
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm11, %xmm1
 ; SSE2-NEXT: pxor %xmm3, %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB9_6
 ; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm8, %eax
-; SSE2-NEXT: movw %ax, 4(%rdi)
+; SSE2-NEXT: pextrw $2, %xmm8, %ecx
+; SSE2-NEXT: movw %cx, 4(%rdi)
 ; SSE2-NEXT: .LBB9_6: # %else4
 ; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
 ; SSE2-NEXT: pcmpgtd %xmm11, %xmm10
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je .LBB9_8
 ; SSE2-NEXT: # %bb.7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm8, %eax
-; SSE2-NEXT: movw %ax, 6(%rdi)
+; SSE2-NEXT: pextrw $3, %xmm8, %ecx
+; SSE2-NEXT: movw %cx, 6(%rdi)
 ; SSE2-NEXT: .LBB9_8: # %else6
 ; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: pxor %xmm9, %xmm0
 ; SSE2-NEXT: pxor %xmm10, %xmm9
 ; SSE2-NEXT: pand %xmm10, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $16, %al
 ; SSE2-NEXT: je .LBB9_10
 ; SSE2-NEXT: # %bb.9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm8, %eax
-; SSE2-NEXT: movw %ax, 8(%rdi)
+; SSE2-NEXT: pextrw $4, %xmm8, %ecx
+; SSE2-NEXT: movw %cx, 8(%rdi)
 ; SSE2-NEXT: .LBB9_10: # %else8
 ; SSE2-NEXT: por %xmm0, %xmm2
 ; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $32, %al
 ; SSE2-NEXT: je .LBB9_12
 ; SSE2-NEXT: # %bb.11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm8, %eax
-; SSE2-NEXT: movw %ax, 10(%rdi)
+; SSE2-NEXT: pextrw $5, %xmm8, %ecx
+; SSE2-NEXT: movw %cx, 10(%rdi)
 ; SSE2-NEXT: .LBB9_12: # %else10
 ; SSE2-NEXT: pslld $16, %xmm3
 ; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je .LBB9_14
 ; SSE2-NEXT: # %bb.13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm8, %eax
-; SSE2-NEXT: movw %ax, 12(%rdi)
+; SSE2-NEXT: pextrw $6, %xmm8, %ecx
+; SSE2-NEXT: movw %cx, 12(%rdi)
 ; SSE2-NEXT: .LBB9_14: # %else12
 ; SSE2-NEXT: psrad $16, %xmm3
 ; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm5
-; SSE2-NEXT: pextrw $6, %xmm5, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je .LBB9_16
 ; SSE2-NEXT: # %bb.15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm8, %eax
-; SSE2-NEXT: movw %ax, 14(%rdi)
+; SSE2-NEXT: pextrw $7, %xmm8, %ecx
+; SSE2-NEXT: movw %cx, 14(%rdi)
 ; SSE2-NEXT: .LBB9_16: # %else14
 ; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB9_18
-; SSE2-NEXT: # %bb.17: # %cond.store15
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: movw %ax, 16(%rdi)
-; SSE2-NEXT: .LBB9_18: # %else16
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB9_20
-; SSE2-NEXT: # %bb.19: # %cond.store17
-; SSE2-NEXT: pextrw $1, %xmm2, %eax
-; SSE2-NEXT: movw %ax, 18(%rdi)
+; SSE2-NEXT: testl $256, %eax # imm = 0x100
+; SSE2-NEXT: jne .LBB9_17
+; SSE2-NEXT: # %bb.18: # %else16
+; SSE2-NEXT: testl $512, %eax # imm = 0x200
+; SSE2-NEXT: jne .LBB9_19
 ; SSE2-NEXT: .LBB9_20: # %else18
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm6, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB9_22
-; SSE2-NEXT: # %bb.21: # %cond.store19
-; SSE2-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-NEXT: movw %ax, 20(%rdi)
+; SSE2-NEXT: testl $1024, %eax # imm = 0x400
+; SSE2-NEXT: jne .LBB9_21
 ; SSE2-NEXT: .LBB9_22: # %else20
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm6
-; SSE2-NEXT: pextrw $6, %xmm6, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB9_24
-; SSE2-NEXT: # %bb.23: # %cond.store21
-; SSE2-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-NEXT: movw %ax, 22(%rdi)
+; SSE2-NEXT: testl $2048, %eax # imm = 0x800
+; SSE2-NEXT: jne .LBB9_23
 ; SSE2-NEXT: .LBB9_24: # %else22
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: notl %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB9_26
-; SSE2-NEXT: # %bb.25: # %cond.store23
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-NEXT: movw %ax, 24(%rdi)
+; SSE2-NEXT: testl $4096, %eax # imm = 0x1000
+; SSE2-NEXT: jne .LBB9_25
 ; SSE2-NEXT: .LBB9_26: # %else24
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB9_28
-; SSE2-NEXT: # %bb.27: # %cond.store25
-; SSE2-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-NEXT: movw %ax, 26(%rdi)
+; SSE2-NEXT: testl $8192, %eax # imm = 0x2000
+; SSE2-NEXT: jne .LBB9_27
 ; SSE2-NEXT: .LBB9_28: # %else26
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE2-NEXT: pxor %xmm7, %xmm1
-; SSE2-NEXT: pextrw $4, %xmm1, %eax
-; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB9_30
-; SSE2-NEXT: # %bb.29: # %cond.store27
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-NEXT: movw %ax, 28(%rdi)
+; SSE2-NEXT: testl $16384, %eax # imm = 0x4000
+; SSE2-NEXT: jne .LBB9_29
 ; SSE2-NEXT: .LBB9_30: # %else28
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: pextrw $6, %xmm7, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: testl $32768, %eax # imm = 0x8000
+; SSE2-NEXT: jne .LBB9_31
+; SSE2-NEXT: .LBB9_32: # %else30
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB9_17: # %cond.store15
+; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: movw %cx, 16(%rdi)
+; SSE2-NEXT: testl $512, %eax # imm = 0x200
+; SSE2-NEXT: je .LBB9_20
+; SSE2-NEXT: .LBB9_19: # %cond.store17
+; SSE2-NEXT: pextrw $1, %xmm2, %ecx
+; SSE2-NEXT: movw %cx, 18(%rdi)
+; SSE2-NEXT: testl $1024, %eax # imm = 0x400
+; SSE2-NEXT: je .LBB9_22
+; SSE2-NEXT: .LBB9_21: # %cond.store19
+; SSE2-NEXT: pextrw $2, %xmm2, %ecx
+; SSE2-NEXT: movw %cx, 20(%rdi)
+; SSE2-NEXT: testl $2048, %eax # imm = 0x800
+; SSE2-NEXT: je .LBB9_24
+; SSE2-NEXT: .LBB9_23: # %cond.store21
+; SSE2-NEXT: pextrw $3, %xmm2, %ecx
+; SSE2-NEXT: movw %cx, 22(%rdi)
+; SSE2-NEXT: testl $4096, %eax # imm = 0x1000
+; SSE2-NEXT: je .LBB9_26
+; SSE2-NEXT: .LBB9_25: # %cond.store23
+; SSE2-NEXT: pextrw $4, %xmm2, %ecx
+; SSE2-NEXT: movw %cx, 24(%rdi)
+; SSE2-NEXT: testl $8192, %eax # imm = 0x2000
+; SSE2-NEXT: je .LBB9_28
+; SSE2-NEXT: .LBB9_27: # %cond.store25
+; SSE2-NEXT: pextrw $5, %xmm2, %ecx
+; SSE2-NEXT: movw %cx, 26(%rdi)
+; SSE2-NEXT: testl $16384, %eax # imm = 0x4000
+; SSE2-NEXT: je .LBB9_30
+; SSE2-NEXT: .LBB9_29: # %cond.store27
+; SSE2-NEXT: pextrw $6, %xmm2, %ecx
+; SSE2-NEXT: movw %cx, 28(%rdi)
+; SSE2-NEXT: testl $32768, %eax # imm = 0x8000
 ; SSE2-NEXT: je .LBB9_32
-; SSE2-NEXT: # %bb.31: # %cond.store29
+; SSE2-NEXT: .LBB9_31: # %cond.store29
 ; SSE2-NEXT: pextrw $7, %xmm2, %eax
 ; SSE2-NEXT: movw %ax, 30(%rdi)
-; SSE2-NEXT: .LBB9_32: # %else30
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: truncstore_v16i32_v16i16:
 ; SSE4: # %bb.0:
-; SSE4-NEXT: pxor %xmm8, %xmm8
-; SSE4-NEXT: pcmpeqd %xmm4, %xmm8
-; SSE4-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE4-NEXT: pxor %xmm8, %xmm9
+; SSE4-NEXT: pxor %xmm9, %xmm9
 ; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
 ; SSE4-NEXT: pminud %xmm8, %xmm1
 ; SSE4-NEXT: pminud %xmm8, %xmm0
 ; SSE4-NEXT: packusdw %xmm1, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm9, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB9_2
-; SSE4-NEXT: # %bb.1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
-; SSE4-NEXT: .LBB9_2: # %else
-; SSE4-NEXT: pextrb $4, %xmm9, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB9_4
-; SSE4-NEXT: # %bb.3: # %cond.store1
-; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
-; SSE4-NEXT: .LBB9_4: # %else2
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE4-NEXT: pcmpeqd %xmm9, %xmm7
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE4-NEXT: pxor %xmm4, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB9_6
-; SSE4-NEXT: # %bb.5: # %cond.store3
-; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
+; SSE4-NEXT: pxor %xmm1, %xmm7
+; SSE4-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE4-NEXT: pxor %xmm1, %xmm6
+; SSE4-NEXT: packssdw %xmm7, %xmm6
+; SSE4-NEXT: pcmpeqd %xmm9, %xmm5
+; SSE4-NEXT: pxor %xmm1, %xmm5
+; SSE4-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE4-NEXT: pxor %xmm1, %xmm4
+; SSE4-NEXT: packssdw %xmm5, %xmm4
+; SSE4-NEXT: packsswb %xmm6, %xmm4
+; SSE4-NEXT: pmovmskb %xmm4, %eax
+; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: jne .LBB9_1
+; SSE4-NEXT: # %bb.2: # %else
+; SSE4-NEXT: testb $2, %al
+; SSE4-NEXT: jne .LBB9_3
+; SSE4-NEXT: .LBB9_4: # %else2
+; SSE4-NEXT: testb $4, %al
+; SSE4-NEXT: jne .LBB9_5
 ; SSE4-NEXT: .LBB9_6: # %else4
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB9_8
-; SSE4-NEXT: # %bb.7: # %cond.store5
-; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
+; SSE4-NEXT: testb $8, %al
+; SSE4-NEXT: jne .LBB9_7
 ; SSE4-NEXT: .LBB9_8: # %else6
-; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE4-NEXT: pxor %xmm4, %xmm1
-; SSE4-NEXT: pextrb $0, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB9_10
-; SSE4-NEXT: # %bb.9: # %cond.store7
-; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi)
+; SSE4-NEXT: testb $16, %al
+; SSE4-NEXT: jne .LBB9_9
 ; SSE4-NEXT: .LBB9_10: # %else8
-; SSE4-NEXT: pextrb $4, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB9_12
-; SSE4-NEXT: # %bb.11: # %cond.store9
-; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi)
+; SSE4-NEXT: testb $32, %al
+; SSE4-NEXT: jne .LBB9_11
 ; SSE4-NEXT: .LBB9_12: # %else10
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE4-NEXT: pxor %xmm5, %xmm1
-; SSE4-NEXT: pextrb $8, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $64, %al
 ; SSE4-NEXT: je .LBB9_14
-; SSE4-NEXT: # %bb.13: # %cond.store11
+; SSE4-NEXT: .LBB9_13: # %cond.store11
 ; SSE4-NEXT: pextrw $6, %xmm0, 12(%rdi)
 ; SSE4-NEXT: .LBB9_14: # %else12
 ; SSE4-NEXT: pminud %xmm8, %xmm3
 ; SSE4-NEXT: pminud %xmm8, %xmm2
-; SSE4-NEXT: pextrb $12, %xmm1, %eax
-; SSE4-NEXT: testb $1, %al
+; SSE4-NEXT: testb $-128, %al
 ; SSE4-NEXT: je .LBB9_16
 ; SSE4-NEXT: # %bb.15: # %cond.store13
 ; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi)
 ; SSE4-NEXT: .LBB9_16: # %else14
 ; SSE4-NEXT: packusdw %xmm3, %xmm2
-; SSE4-NEXT: pxor %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm0
-; SSE4-NEXT: pextrb $0, %xmm0, %eax
-; SSE4-NEXT: testb $1, %al
-; SSE4-NEXT: je .LBB9_18
-; SSE4-NEXT: # %bb.17: #
%cond.store15 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB9_17 +; SSE4-NEXT: # %bb.18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB9_19 +; SSE4-NEXT: .LBB9_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB9_21 +; SSE4-NEXT: .LBB9_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB9_23 +; SSE4-NEXT: .LBB9_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB9_25 +; SSE4-NEXT: .LBB9_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB9_27 +; SSE4-NEXT: .LBB9_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB9_29 +; SSE4-NEXT: .LBB9_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB9_31 +; SSE4-NEXT: .LBB9_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB9_1: # %cond.store +; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB9_4 +; SSE4-NEXT: .LBB9_3: # %cond.store1 +; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: je .LBB9_6 +; SSE4-NEXT: .LBB9_5: # %cond.store3 +; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: je .LBB9_8 +; SSE4-NEXT: .LBB9_7: # %cond.store5 +; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: je .LBB9_10 +; SSE4-NEXT: .LBB9_9: # %cond.store7 +; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi) +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: je .LBB9_12 +; SSE4-NEXT: .LBB9_11: # %cond.store9 +; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi) +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB9_13 +; SSE4-NEXT: jmp .LBB9_14 +; SSE4-NEXT: .LBB9_17: # %cond.store15 ; SSE4-NEXT: pextrw $0, %xmm2, 16(%rdi) -; SSE4-NEXT: .LBB9_18: # %else16 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB9_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB9_19: # %cond.store17 ; SSE4-NEXT: pextrw $1, %xmm2, 18(%rdi) -; SSE4-NEXT: .LBB9_20: # %else18 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm6, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB9_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB9_21: # %cond.store19 ; SSE4-NEXT: pextrw $2, %xmm2, 20(%rdi) -; SSE4-NEXT: .LBB9_22: # %else20 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB9_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB9_23: # %cond.store21 ; SSE4-NEXT: pextrw $3, %xmm2, 22(%rdi) -; SSE4-NEXT: .LBB9_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB9_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB9_25: # %cond.store23 ; SSE4-NEXT: pextrw $4, %xmm2, 24(%rdi) -; SSE4-NEXT: .LBB9_26: # %else24 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB9_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB9_27: # %cond.store25 ; SSE4-NEXT: pextrw $5, %xmm2, 26(%rdi) -; SSE4-NEXT: .LBB9_28: # %else26 -; 
SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB9_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB9_29: # %cond.store27 ; SSE4-NEXT: pextrw $6, %xmm2, 28(%rdi) -; SSE4-NEXT: .LBB9_30: # %else28 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB9_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB9_31: # %cond.store29 ; SSE4-NEXT: pextrw $7, %xmm2, 30(%rdi) -; SSE4-NEXT: .LBB9_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpminud %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpminud %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpminud %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpextrb $0, %xmm6, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB9_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB9_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB9_3 ; AVX1-NEXT: .LBB9_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $2, %xmm5, %eax -; 
AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB9_5 ; AVX1-NEXT: .LBB9_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB9_7 ; AVX1-NEXT: .LBB9_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB9_9 ; AVX1-NEXT: .LBB9_10: # %else8 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB9_11 ; AVX1-NEXT: .LBB9_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB9_13 ; AVX1-NEXT: .LBB9_14: # %else12 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB9_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB9_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX1-NEXT: .LBB9_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: je .LBB9_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 -; AVX1-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX1-NEXT: .LBB9_18: # %else16 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 -; AVX1-NEXT: vpextrw $1, %xmm0, 18(%rdi) +; AVX1-NEXT: jne .LBB9_17 +; AVX1-NEXT: # %bb.18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB9_19 ; AVX1-NEXT: .LBB9_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd 
%xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB9_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 -; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdi) +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB9_21 ; AVX1-NEXT: .LBB9_22: # %else20 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB9_23 +; AVX1-NEXT: .LBB9_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB9_25 +; AVX1-NEXT: .LBB9_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB9_27 +; AVX1-NEXT: .LBB9_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB9_29 +; AVX1-NEXT: .LBB9_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB9_31 +; AVX1-NEXT: .LBB9_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB9_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB9_4 +; AVX1-NEXT: .LBB9_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je .LBB9_6 +; AVX1-NEXT: .LBB9_5: # %cond.store3 +; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je .LBB9_8 +; AVX1-NEXT: .LBB9_7: # %cond.store5 +; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: je .LBB9_10 +; AVX1-NEXT: .LBB9_9: # %cond.store7 +; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: je .LBB9_12 +; AVX1-NEXT: .LBB9_11: # %cond.store9 +; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: je .LBB9_14 +; AVX1-NEXT: .LBB9_13: # %cond.store11 +; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB9_15 +; AVX1-NEXT: jmp .LBB9_16 +; AVX1-NEXT: .LBB9_17: # %cond.store15 +; AVX1-NEXT: vpextrw $0, %xmm0, 16(%rdi) +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: je .LBB9_20 +; AVX1-NEXT: .LBB9_19: # %cond.store17 +; AVX1-NEXT: vpextrw $1, %xmm0, 18(%rdi) +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: je .LBB9_22 +; AVX1-NEXT: .LBB9_21: # %cond.store19 +; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdi) +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB9_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB9_23: # %cond.store21 ; AVX1-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX1-NEXT: .LBB9_24: # %else22 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB9_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB9_25: # %cond.store23 ; AVX1-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX1-NEXT: .LBB9_26: # %else24 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 
-; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB9_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB9_27: # %cond.store25 ; AVX1-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX1-NEXT: .LBB9_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB9_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB9_29: # %cond.store27 ; AVX1-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX1-NEXT: .LBB9_30: # %else28 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB9_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB9_31: # %cond.store29 ; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX1-NEXT: .LBB9_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i32_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm7, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpextrb $0, %xmm6, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB9_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB9_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB9_3 +; AVX2-NEXT: .LBB9_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB9_5 +; AVX2-NEXT: .LBB9_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB9_7 +; AVX2-NEXT: .LBB9_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB9_9 +; AVX2-NEXT: .LBB9_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB9_11 +; AVX2-NEXT: .LBB9_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB9_13 +; AVX2-NEXT: .LBB9_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: je .LBB9_16 +; AVX2-NEXT: .LBB9_15: # %cond.store13 +; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX2-NEXT: .LBB9_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: jne .LBB9_17 +; AVX2-NEXT: # %bb.18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm 
= 0x200 +; AVX2-NEXT: jne .LBB9_19 +; AVX2-NEXT: .LBB9_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB9_21 +; AVX2-NEXT: .LBB9_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB9_23 +; AVX2-NEXT: .LBB9_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB9_25 +; AVX2-NEXT: .LBB9_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB9_27 +; AVX2-NEXT: .LBB9_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB9_29 +; AVX2-NEXT: .LBB9_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB9_31 +; AVX2-NEXT: .LBB9_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB9_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB9_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB9_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB9_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB9_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB9_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB9_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB9_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB9_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB9_7: # %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB9_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $4, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB9_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB9_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB9_10: # %else8 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB9_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB9_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB9_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je 
.LBB9_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB9_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB9_14: # %else12 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB9_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 -; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB9_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: je .LBB9_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB9_15 +; AVX2-NEXT: jmp .LBB9_16 +; AVX2-NEXT: .LBB9_17: # %cond.store15 ; AVX2-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX2-NEXT: .LBB9_18: # %else16 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB9_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB9_19: # %cond.store17 ; AVX2-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX2-NEXT: .LBB9_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $10, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB9_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB9_21: # %cond.store19 ; AVX2-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX2-NEXT: .LBB9_22: # %else20 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB9_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB9_23: # %cond.store21 ; AVX2-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX2-NEXT: .LBB9_24: # %else22 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB9_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB9_25: # %cond.store23 ; AVX2-NEXT: vpextrw $4, %xmm0, 24(%rdi) -; AVX2-NEXT: .LBB9_26: # %else24 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB9_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB9_27: # %cond.store25 ; AVX2-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX2-NEXT: .LBB9_28: # %else26 -; AVX2-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB9_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB9_29: # %cond.store27 ; AVX2-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX2-NEXT: .LBB9_30: # %else28 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB9_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB9_31: # %cond.store29 ; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX2-NEXT: .LBB9_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3272,116 +3081,117 @@ ; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB9_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB9_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB9_3 +; AVX512F-NEXT: .LBB9_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB9_5 +; AVX512F-NEXT: .LBB9_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB9_7 +; AVX512F-NEXT: .LBB9_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB9_9 +; AVX512F-NEXT: .LBB9_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB9_11 +; AVX512F-NEXT: .LBB9_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB9_13 +; AVX512F-NEXT: .LBB9_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: je .LBB9_16 +; AVX512F-NEXT: .LBB9_15: # %cond.store13 +; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512F-NEXT: .LBB9_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: jne .LBB9_17 +; AVX512F-NEXT: # %bb.18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB9_19 +; AVX512F-NEXT: .LBB9_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB9_21 +; AVX512F-NEXT: .LBB9_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB9_23 +; AVX512F-NEXT: .LBB9_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB9_25 +; AVX512F-NEXT: .LBB9_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB9_27 +; AVX512F-NEXT: .LBB9_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB9_29 +; AVX512F-NEXT: .LBB9_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB9_31 +; AVX512F-NEXT: .LBB9_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB9_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB9_2: # %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB9_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB9_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB9_4: # %else2 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, 
%al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB9_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB9_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB9_6: # %else4 -; AVX512F-NEXT: kshiftrw $3, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB9_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB9_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB9_8: # %else6 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB9_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB9_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB9_10: # %else8 -; AVX512F-NEXT: kshiftrw $5, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB9_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB9_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB9_12: # %else10 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB9_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB9_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB9_14: # %else12 -; AVX512F-NEXT: kshiftrw $7, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB9_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 -; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB9_16: # %else14 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: je .LBB9_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB9_15 +; AVX512F-NEXT: jmp .LBB9_16 +; AVX512F-NEXT: .LBB9_17: # %cond.store15 ; AVX512F-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; AVX512F-NEXT: .LBB9_18: # %else16 -; AVX512F-NEXT: kshiftrw $9, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB9_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB9_19: # %cond.store17 ; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi) -; AVX512F-NEXT: .LBB9_20: # %else18 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB9_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB9_21: # %cond.store19 ; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi) -; AVX512F-NEXT: .LBB9_22: # %else20 -; AVX512F-NEXT: kshiftrw $11, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB9_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB9_23: # %cond.store21 ; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi) -; AVX512F-NEXT: .LBB9_24: # %else22 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB9_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB9_25: # %cond.store23 ; AVX512F-NEXT: 
vpextrw $4, %xmm0, 24(%rdi) -; AVX512F-NEXT: .LBB9_26: # %else24 -; AVX512F-NEXT: kshiftrw $13, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB9_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB9_27: # %cond.store25 ; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi) -; AVX512F-NEXT: .LBB9_28: # %else26 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB9_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB9_29: # %cond.store27 ; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi) -; AVX512F-NEXT: .LBB9_30: # %else28 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB9_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB9_31: # %cond.store29 ; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi) -; AVX512F-NEXT: .LBB9_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3412,7 +3222,6 @@ ; SSE2-LABEL: truncstore_v16i32_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm12 @@ -3445,158 +3254,124 @@ ; SSE2-NEXT: por %xmm2, %xmm9 ; SSE2-NEXT: packuswb %xmm1, %xmm9 ; SSE2-NEXT: packuswb %xmm9, %xmm12 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm12, %eax -; SSE2-NEXT: je .LBB10_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB10_2: # %else +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pextrw $2, %xmm8, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm6, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %eax +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: movd %xmm12, %ecx +; SSE2-NEXT: jne .LBB10_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB10_3 ; SSE2-NEXT: .LBB10_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB10_5 ; SSE2-NEXT: .LBB10_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pextrw $6, %xmm4, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB10_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB10_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB10_8: # %else6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; 
SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $2, %xmm12, %eax +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: pextrw $2, %xmm12, %ecx ; SSE2-NEXT: je .LBB10_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB10_10: # %else8 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB10_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: movb %ah, 5(%rdi) +; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB10_12: # %else10 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $3, %xmm12, %eax +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: pextrw $3, %xmm12, %ecx ; SSE2-NEXT: je .LBB10_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB10_14: # %else12 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pextrw $6, %xmm5, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB10_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: movb %ah, 7(%rdi) +; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB10_16: # %else14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $4, %xmm12, %eax +; SSE2-NEXT: testl $256, %eax # imm = 0x100 +; SSE2-NEXT: pextrw $4, %xmm12, %ecx ; SSE2-NEXT: je .LBB10_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 -; SSE2-NEXT: movb %al, 8(%rdi) +; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB10_18: # %else16 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB10_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 -; SSE2-NEXT: movb %ah, 9(%rdi) +; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB10_20: # %else18 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $5, %xmm12, %eax +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 +; SSE2-NEXT: pextrw $5, %xmm12, %ecx ; SSE2-NEXT: je .LBB10_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 -; SSE2-NEXT: movb %al, 10(%rdi) +; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB10_22: # %else20 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: pextrw $6, %xmm6, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB10_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 -; SSE2-NEXT: movb %ah, 11(%rdi) +; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB10_24: # %else22 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $6, %xmm12, %eax +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE2-NEXT: pextrw $6, %xmm12, %ecx ; SSE2-NEXT: je .LBB10_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 -; SSE2-NEXT: movb %al, 12(%rdi) +; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB10_26: # %else24 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw 
$2, %xmm0, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB10_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 -; SSE2-NEXT: movb %ah, 13(%rdi) +; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB10_28: # %else26 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: pextrw $7, %xmm12, %eax -; SSE2-NEXT: je .LBB10_30 -; SSE2-NEXT: # %bb.29: # %cond.store27 -; SSE2-NEXT: movb %al, 14(%rdi) -; SSE2-NEXT: .LBB10_30: # %else28 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pextrw $6, %xmm7, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB10_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 -; SSE2-NEXT: movb %ah, 15(%rdi) +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE2-NEXT: pextrw $7, %xmm12, %ecx +; SSE2-NEXT: jne .LBB10_29 +; SSE2-NEXT: # %bb.30: # %else28 +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB10_31 ; SSE2-NEXT: .LBB10_32: # %else30 ; SSE2-NEXT: retq +; SSE2-NEXT: .LBB10_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB10_4 +; SSE2-NEXT: .LBB10_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB10_6 +; SSE2-NEXT: .LBB10_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB10_7 +; SSE2-NEXT: jmp .LBB10_8 +; SSE2-NEXT: .LBB10_29: # %cond.store27 +; SSE2-NEXT: movb %cl, 14(%rdi) +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: je .LBB10_32 +; SSE2-NEXT: .LBB10_31: # %cond.store29 +; SSE2-NEXT: movb %ch, 15(%rdi) +; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i32_v16i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm9, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE4-NEXT: pxor %xmm9, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] ; SSE4-NEXT: pminud %xmm9, %xmm1 ; SSE4-NEXT: pminud %xmm9, %xmm0 @@ -3605,496 +3380,407 @@ ; SSE4-NEXT: pminud %xmm9, %xmm2 ; SSE4-NEXT: packusdw %xmm3, %xmm2 ; SSE4-NEXT: packuswb %xmm2, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_2 -; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB10_2: # %else -; SSE4-NEXT: pextrb $4, %xmm8, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB10_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) +; SSE4-NEXT: pxor %xmm1, %xmm7 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE4-NEXT: pxor %xmm1, %xmm6 +; SSE4-NEXT: packssdw %xmm7, %xmm6 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pxor %xmm1, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pxor %xmm1, %xmm4 +; SSE4-NEXT: packssdw %xmm5, %xmm4 +; SSE4-NEXT: packsswb %xmm6, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB10_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB10_3 +; SSE4-NEXT: .LBB10_4: 
# %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB10_5 ; SSE4-NEXT: .LBB10_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB10_7 ; SSE4-NEXT: .LBB10_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 -; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB10_9 ; SSE4-NEXT: .LBB10_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB10_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 -; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB10_11 ; SSE4-NEXT: .LBB10_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm5, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB10_13 +; SSE4-NEXT: .LBB10_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB10_15 +; SSE4-NEXT: .LBB10_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB10_17 +; SSE4-NEXT: .LBB10_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB10_19 +; SSE4-NEXT: .LBB10_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB10_21 +; SSE4-NEXT: .LBB10_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB10_23 +; SSE4-NEXT: .LBB10_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB10_25 +; SSE4-NEXT: .LBB10_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB10_27 +; SSE4-NEXT: .LBB10_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB10_29 +; SSE4-NEXT: .LBB10_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB10_31 +; SSE4-NEXT: .LBB10_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB10_1: # %cond.store +; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB10_4 +; SSE4-NEXT: .LBB10_3: # %cond.store1 +; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: je .LBB10_6 +; SSE4-NEXT: .LBB10_5: # %cond.store3 +; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: je .LBB10_8 +; SSE4-NEXT: .LBB10_7: # %cond.store5 +; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: je .LBB10_10 +; SSE4-NEXT: .LBB10_9: # %cond.store7 +; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: je .LBB10_12 +; SSE4-NEXT: .LBB10_11: # %cond.store9 +; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB10_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB10_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB10_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB10_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB10_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB10_16: # %else14 -; SSE4-NEXT: pxor %xmm2, %xmm2 
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB10_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB10_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB10_18: # %else16 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB10_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB10_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB10_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm6 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm6, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB10_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB10_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB10_22: # %else20 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB10_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB10_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB10_24: # %else22 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB10_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB10_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB10_26: # %else24 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB10_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB10_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB10_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB10_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB10_29: # %cond.store27 ; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB10_30: # %else28 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB10_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB10_31: # %cond.store29 ; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB10_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpminud %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpminud %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 -; 
AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpminud %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm6, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB10_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB10_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB10_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB10_3 +; AVX1-NEXT: .LBB10_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB10_5 +; AVX1-NEXT: .LBB10_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB10_7 +; AVX1-NEXT: .LBB10_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB10_9 +; AVX1-NEXT: .LBB10_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB10_11 +; AVX1-NEXT: .LBB10_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB10_13 +; AVX1-NEXT: .LBB10_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB10_15 +; AVX1-NEXT: .LBB10_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB10_17 +; AVX1-NEXT: .LBB10_18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB10_19 +; AVX1-NEXT: .LBB10_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB10_21 +; AVX1-NEXT: .LBB10_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB10_23 +; AVX1-NEXT: .LBB10_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB10_25 +; AVX1-NEXT: .LBB10_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB10_27 +; AVX1-NEXT: .LBB10_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB10_29 +; AVX1-NEXT: .LBB10_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB10_31 +; AVX1-NEXT: .LBB10_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB10_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB10_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB10_3: # %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB10_4: # %else2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpextrb $2, %xmm5, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB10_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB10_5: # %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB10_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB10_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB10_7: # %cond.store5 ; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB10_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB10_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB10_9: # %cond.store7 ; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB10_10: # %else8 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB10_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB10_11: # %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB10_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB10_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB10_13: # %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB10_14: # %else12 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB10_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB10_15: # %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB10_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB10_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: .LBB10_17: # %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB10_18: # %else16 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: 
je .LBB10_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB10_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB10_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB10_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB10_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB10_22: # %else20 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB10_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB10_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB10_24: # %else22 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB10_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB10_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB10_26: # %else24 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB10_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB10_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB10_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB10_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB10_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB10_30: # %else28 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB10_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB10_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB10_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: 
vpminud %ymm7, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm6, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB10_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB10_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB10_3 +; AVX2-NEXT: .LBB10_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB10_5 +; AVX2-NEXT: .LBB10_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB10_7 +; AVX2-NEXT: .LBB10_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB10_9 +; AVX2-NEXT: .LBB10_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB10_11 +; AVX2-NEXT: .LBB10_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB10_13 +; AVX2-NEXT: .LBB10_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB10_15 +; AVX2-NEXT: .LBB10_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB10_17 +; AVX2-NEXT: .LBB10_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB10_19 +; AVX2-NEXT: .LBB10_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB10_21 +; AVX2-NEXT: .LBB10_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB10_23 +; AVX2-NEXT: .LBB10_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB10_25 +; AVX2-NEXT: .LBB10_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB10_27 +; AVX2-NEXT: .LBB10_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB10_29 +; AVX2-NEXT: .LBB10_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB10_31 +; AVX2-NEXT: .LBB10_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB10_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB10_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB10_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB10_3: # %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB10_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm5 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpackssdw %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpextrb $2, %xmm5, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB10_6 -; AVX2-NEXT: # %bb.5: # 
%cond.store3 +; AVX2-NEXT: .LBB10_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB10_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB10_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB10_7: # %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB10_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $4, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB10_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB10_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB10_10: # %else8 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB10_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB10_11: # %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB10_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB10_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB10_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB10_14: # %else12 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB10_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB10_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB10_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB10_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB10_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB10_18: # %else16 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB10_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB10_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX2-NEXT: .LBB10_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor 
%xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpackssdw %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm4 -; AVX2-NEXT: vpextrb $10, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB10_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB10_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB10_22: # %else20 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB10_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB10_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB10_24: # %else22 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB10_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB10_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB10_26: # %else24 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB10_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB10_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB10_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB10_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB10_29: # %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB10_30: # %else28 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB10_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB10_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB10_32: # %else30 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4104,115 +3790,117 @@ ; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB10_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB10_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB10_3 +; AVX512F-NEXT: .LBB10_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB10_5 +; AVX512F-NEXT: .LBB10_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB10_7 +; AVX512F-NEXT: .LBB10_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB10_9 +; AVX512F-NEXT: .LBB10_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB10_11 +; 
AVX512F-NEXT: .LBB10_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB10_13 +; AVX512F-NEXT: .LBB10_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB10_15 +; AVX512F-NEXT: .LBB10_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB10_17 +; AVX512F-NEXT: .LBB10_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB10_19 +; AVX512F-NEXT: .LBB10_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB10_21 +; AVX512F-NEXT: .LBB10_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB10_23 +; AVX512F-NEXT: .LBB10_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB10_25 +; AVX512F-NEXT: .LBB10_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB10_27 +; AVX512F-NEXT: .LBB10_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB10_29 +; AVX512F-NEXT: .LBB10_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB10_31 +; AVX512F-NEXT: .LBB10_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB10_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB10_2: # %else -; AVX512F-NEXT: kshiftrw $1, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB10_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB10_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB10_4: # %else2 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB10_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB10_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB10_6: # %else4 -; AVX512F-NEXT: kshiftrw $3, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB10_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB10_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB10_8: # %else6 -; AVX512F-NEXT: kshiftrw $4, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB10_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB10_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB10_10: # %else8 -; AVX512F-NEXT: kshiftrw $5, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB10_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB10_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB10_12: # %else10 -; AVX512F-NEXT: kshiftrw $6, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB10_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB10_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB10_14: # %else12 -; AVX512F-NEXT: kshiftrw $7, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB10_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB10_15: # 
%cond.store13 ; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB10_16: # %else14 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB10_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB10_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB10_18: # %else16 -; AVX512F-NEXT: kshiftrw $9, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB10_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB10_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB10_20: # %else18 -; AVX512F-NEXT: kshiftrw $10, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB10_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB10_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB10_22: # %else20 -; AVX512F-NEXT: kshiftrw $11, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB10_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB10_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB10_24: # %else22 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB10_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB10_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB10_26: # %else24 -; AVX512F-NEXT: kshiftrw $13, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB10_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB10_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB10_28: # %else26 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB10_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB10_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB10_30: # %else28 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB10_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB10_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB10_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4242,329 +3930,310 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm9, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: packssdw %xmm0, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = 
[2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: por %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm4, %xmm5 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm5, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB11_2: # %else -; SSE2-NEXT: psrlq $16, %xmm10 -; SSE2-NEXT: movd %xmm10, %eax -; SSE2-NEXT: shrl $16, %eax +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm0, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm5, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) +; SSE2-NEXT: jne .LBB11_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB11_3 ; SSE2-NEXT: .LBB11_4: # %else2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm5, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB11_5 ; SSE2-NEXT: .LBB11_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm5, %eax -; SSE2-NEXT: movw %ax, 6(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB11_7 ; SSE2-NEXT: .LBB11_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm5, %eax -; SSE2-NEXT: movw %ax, 8(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB11_9 ; SSE2-NEXT: .LBB11_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm5, %eax -; SSE2-NEXT: movw %ax, 10(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB11_11 ; SSE2-NEXT: .LBB11_12: # %else10 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB11_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm5, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) +; 
SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB11_13 ; SSE2-NEXT: .LBB11_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB11_15 +; SSE2-NEXT: .LBB11_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB11_1: # %cond.store +; SSE2-NEXT: movd %xmm5, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB11_4 +; SSE2-NEXT: .LBB11_3: # %cond.store1 +; SSE2-NEXT: pextrw $1, %xmm5, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB11_6 +; SSE2-NEXT: .LBB11_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm5, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB11_8 +; SSE2-NEXT: .LBB11_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm5, %ecx +; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB11_10 +; SSE2-NEXT: .LBB11_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm5, %ecx +; SSE2-NEXT: movw %cx, 8(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB11_12 +; SSE2-NEXT: .LBB11_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm5, %ecx +; SSE2-NEXT: movw %cx, 10(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB11_14 +; SSE2-NEXT: .LBB11_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm5, %ecx +; SSE2-NEXT: movw %cx, 12(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB11_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB11_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm5, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) -; SSE2-NEXT: .LBB11_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i32_v8i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm5, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] ; SSE4-NEXT: pminud %xmm5, %xmm1 ; SSE4-NEXT: pminud %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB11_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm1, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE4-NEXT: pxor %xmm1, %xmm2 +; SSE4-NEXT: packssdw %xmm3, %xmm2 +; SSE4-NEXT: packsswb %xmm0, %xmm2 +; SSE4-NEXT: pmovmskb %xmm2, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB11_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB11_3 +; SSE4-NEXT: .LBB11_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB11_5 +; SSE4-NEXT: .LBB11_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB11_7 +; SSE4-NEXT: .LBB11_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB11_9 +; SSE4-NEXT: .LBB11_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB11_11 +; SSE4-NEXT: .LBB11_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB11_13 +; SSE4-NEXT: .LBB11_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB11_15 +; SSE4-NEXT: .LBB11_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB11_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB11_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB11_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB11_3: # %cond.store1 ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB11_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; 
SSE4-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB11_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB11_5: # %cond.store3 ; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB11_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB11_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB11_7: # %cond.store5 ; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB11_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB11_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB11_9: # %cond.store7 ; SSE4-NEXT: pextrw $4, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB11_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB11_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB11_11: # %cond.store9 ; SSE4-NEXT: pextrw $5, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB11_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm3, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB11_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB11_13: # %cond.store11 ; SSE4-NEXT: pextrw $6, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB11_14: # %else12 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB11_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB11_15: # %cond.store13 ; SSE4-NEXT: pextrw $7, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB11_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i32_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpminud %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB11_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB11_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB11_3 +; AVX1-NEXT: .LBB11_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB11_5 +; AVX1-NEXT: .LBB11_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB11_7 +; AVX1-NEXT: .LBB11_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB11_9 +; AVX1-NEXT: .LBB11_10: 
# %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB11_11 +; AVX1-NEXT: .LBB11_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB11_13 +; AVX1-NEXT: .LBB11_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB11_15 +; AVX1-NEXT: .LBB11_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB11_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB11_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB11_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB11_3: # %cond.store1 ; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB11_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB11_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB11_5: # %cond.store3 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB11_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB11_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB11_7: # %cond.store5 ; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB11_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB11_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB11_9: # %cond.store7 ; AVX1-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB11_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB11_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB11_11: # %cond.store9 ; AVX1-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB11_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB11_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB11_13: # %cond.store11 ; AVX1-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB11_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB11_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB11_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB11_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i32_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 -; 
AVX2-NEXT: vpextrb $0, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB11_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB11_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB11_3 +; AVX2-NEXT: .LBB11_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB11_5 +; AVX2-NEXT: .LBB11_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB11_7 +; AVX2-NEXT: .LBB11_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB11_9 +; AVX2-NEXT: .LBB11_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB11_11 +; AVX2-NEXT: .LBB11_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB11_13 +; AVX2-NEXT: .LBB11_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB11_15 +; AVX2-NEXT: .LBB11_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB11_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB11_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB11_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB11_3: # %cond.store1 ; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB11_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB11_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB11_5: # %cond.store3 ; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB11_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB11_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB11_7: # %cond.store5 ; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB11_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB11_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB11_9: # %cond.store7 ; AVX2-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB11_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB11_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB11_11: # %cond.store9 ; AVX2-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB11_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB11_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; 
AVX2-NEXT: .LBB11_13: # %cond.store11 ; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB11_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB11_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB11_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB11_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4572,71 +4241,66 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB11_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB11_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB11_3 +; AVX512F-NEXT: .LBB11_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB11_5 +; AVX512F-NEXT: .LBB11_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB11_7 +; AVX512F-NEXT: .LBB11_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB11_9 +; AVX512F-NEXT: .LBB11_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB11_11 +; AVX512F-NEXT: .LBB11_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB11_13 +; AVX512F-NEXT: .LBB11_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB11_15 +; AVX512F-NEXT: .LBB11_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB11_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB11_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB11_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB11_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB11_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB11_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB11_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB11_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB11_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB11_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB11_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB11_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB11_9: # %cond.store7 ; AVX512F-NEXT: vpextrw $4, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB11_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; 
AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB11_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB11_11: # %cond.store9 ; AVX512F-NEXT: vpextrw $5, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB11_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB11_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB11_13: # %cond.store11 ; AVX512F-NEXT: vpextrw $6, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB11_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB11_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB11_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB11_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4671,326 +4335,307 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: packssdw %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm7, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm9, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 -; SSE2-NEXT: packuswb %xmm4, %xmm10 -; SSE2-NEXT: movd %xmm9, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm10, %eax -; SSE2-NEXT: je .LBB12_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB12_2: # %else -; SSE2-NEXT: psrlq $16, %xmm5 -; SSE2-NEXT: movd %xmm5, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB12_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB12_4: # %else2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: packuswb %xmm4, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm0, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_6 -; 
SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm10, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movd %xmm6, %ecx +; SSE2-NEXT: jne .LBB12_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB12_3 +; SSE2-NEXT: .LBB12_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB12_5 ; SSE2-NEXT: .LBB12_6: # %else4 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm10, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB12_7 ; SSE2-NEXT: .LBB12_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm10, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB12_9 ; SSE2-NEXT: .LBB12_10: # %else8 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm10, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB12_11 ; SSE2-NEXT: .LBB12_12: # %else10 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB12_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm10, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB12_13 ; SSE2-NEXT: .LBB12_14: # %else12 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al +; SSE2-NEXT: jne .LBB12_15 +; SSE2-NEXT: .LBB12_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB12_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB12_4 +; SSE2-NEXT: .LBB12_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB12_6 +; SSE2-NEXT: .LBB12_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm6, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB12_8 +; SSE2-NEXT: .LBB12_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm6, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB12_10 +; SSE2-NEXT: .LBB12_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm6, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB12_12 +; SSE2-NEXT: .LBB12_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm6, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB12_14 +; SSE2-NEXT: .LBB12_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm6, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB12_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm10, %eax +; SSE2-NEXT: .LBB12_15: # %cond.store13 +; SSE2-NEXT: pextrw $7, %xmm6, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB12_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i32_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE4-NEXT: pxor %xmm5, %xmm4 +; SSE4-NEXT: pxor %xmm4, %xmm4 ; SSE4-NEXT: 
movdqa {{.*#+}} xmm5 = [255,255,255,255] ; SSE4-NEXT: pminud %xmm5, %xmm1 ; SSE4-NEXT: pminud %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB12_2 -; SSE4-NEXT: # %bb.1: # %cond.store -; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB12_2: # %else -; SSE4-NEXT: pextrb $4, %xmm4, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB12_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 -; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB12_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB12_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 -; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: pxor %xmm1, %xmm3 +; SSE4-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE4-NEXT: pxor %xmm1, %xmm2 +; SSE4-NEXT: packssdw %xmm3, %xmm2 +; SSE4-NEXT: packsswb %xmm0, %xmm2 +; SSE4-NEXT: pmovmskb %xmm2, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB12_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB12_3 +; SSE4-NEXT: .LBB12_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB12_5 ; SSE4-NEXT: .LBB12_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB12_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 -; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB12_7 ; SSE4-NEXT: .LBB12_8: # %else6 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm2, %xmm1 -; SSE4-NEXT: pextrb $0, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB12_9 +; SSE4-NEXT: .LBB12_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB12_11 +; SSE4-NEXT: .LBB12_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB12_13 +; SSE4-NEXT: .LBB12_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB12_15 +; SSE4-NEXT: .LBB12_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB12_1: # %cond.store +; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: je .LBB12_4 +; SSE4-NEXT: .LBB12_3: # %cond.store1 +; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: je .LBB12_6 +; SSE4-NEXT: .LBB12_5: # %cond.store3 +; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: je .LBB12_8 +; SSE4-NEXT: .LBB12_7: # %cond.store5 +; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB12_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB12_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB12_10: # %else8 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB12_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB12_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB12_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm3, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB12_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB12_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB12_14: # %else12 
-; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB12_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB12_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB12_16: # %else14 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpminud %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB12_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB12_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB12_3 +; AVX1-NEXT: .LBB12_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB12_5 +; AVX1-NEXT: .LBB12_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB12_7 +; AVX1-NEXT: .LBB12_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB12_9 +; AVX1-NEXT: .LBB12_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB12_11 +; AVX1-NEXT: .LBB12_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB12_13 +; AVX1-NEXT: .LBB12_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB12_15 +; AVX1-NEXT: .LBB12_16: # %else14 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB12_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB12_2: # %else -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB12_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB12_3: # %cond.store1 ; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB12_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB12_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB12_5: # %cond.store3 ; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB12_6: # %else4 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB12_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB12_7: # %cond.store5 ; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB12_8: # %else6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, 
%xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB12_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB12_9: # %cond.store7 ; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB12_10: # %else8 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB12_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB12_11: # %cond.store9 ; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB12_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB12_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB12_13: # %cond.store11 ; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB12_14: # %else12 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB12_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB12_15: # %cond.store13 ; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB12_16: # %else14 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v8i32_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminud %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm4, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB12_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB12_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB12_3 +; AVX2-NEXT: .LBB12_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB12_5 +; AVX2-NEXT: .LBB12_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB12_7 +; AVX2-NEXT: .LBB12_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB12_9 +; AVX2-NEXT: .LBB12_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB12_11 +; AVX2-NEXT: .LBB12_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB12_13 +; AVX2-NEXT: .LBB12_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB12_15 +; AVX2-NEXT: .LBB12_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB12_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB12_2: # %else -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB12_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB12_3: # %cond.store1 ; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB12_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: 
vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $8, %xmm4, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB12_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB12_5: # %cond.store3 ; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB12_6: # %else4 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB12_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB12_7: # %cond.store5 ; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB12_8: # %else6 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB12_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB12_9: # %cond.store7 ; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB12_10: # %else8 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB12_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB12_11: # %cond.store9 ; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB12_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB12_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB12_13: # %cond.store11 ; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB12_14: # %else12 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB12_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB12_15: # %cond.store13 ; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB12_16: # %else14 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4998,71 +4643,66 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB12_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB12_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB12_3 +; AVX512F-NEXT: .LBB12_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB12_5 +; AVX512F-NEXT: .LBB12_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB12_7 +; AVX512F-NEXT: .LBB12_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB12_9 +; AVX512F-NEXT: .LBB12_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB12_11 +; AVX512F-NEXT: .LBB12_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB12_13 +; AVX512F-NEXT: .LBB12_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB12_15 +; AVX512F-NEXT: .LBB12_16: # %else14 +; 
AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB12_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB12_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB12_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB12_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB12_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB12_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB12_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB12_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB12_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB12_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB12_8: # %else6 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB12_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB12_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB12_10: # %else8 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB12_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB12_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB12_12: # %else10 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB12_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB12_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB12_14: # %else12 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB12_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB12_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB12_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5099,7 +4739,6 @@ ; SSE2-LABEL: truncstore_v4i32_v4i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: pxor %xmm0, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] @@ -5107,192 +4746,186 @@ ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: .LBB13_2: # %else -; SSE2-NEXT: pcmpeqd %xmm0, 
%xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: movw %ax, 2(%rdi) +; SSE2-NEXT: jne .LBB13_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB13_3 ; SSE2-NEXT: .LBB13_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB13_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: movw %ax, 4(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB13_5 ; SSE2-NEXT: .LBB13_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB13_7 +; SSE2-NEXT: .LBB13_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB13_1: # %cond.store +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: movw %cx, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB13_4 +; SSE2-NEXT: .LBB13_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB13_6 +; SSE2-NEXT: .LBB13_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: movw %cx, 4(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB13_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB13_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: movw %ax, 6(%rdi) -; SSE2-NEXT: .LBB13_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i32_v4i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: movmskps %xmm2, %eax +; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB13_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB13_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB13_3 +; SSE4-NEXT: .LBB13_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB13_5 +; SSE4-NEXT: .LBB13_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB13_7 +; SSE4-NEXT: .LBB13_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB13_1: # %cond.store ; SSE4-NEXT: pextrw $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB13_2: # %else -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB13_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB13_3: # %cond.store1 ; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB13_4: # %else2 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB13_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB13_5: # %cond.store3 ; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB13_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB13_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: 
.LBB13_7: # %cond.store5 ; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB13_8: # %else6 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i32_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB13_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB13_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB13_3 +; AVX1-NEXT: .LBB13_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB13_5 +; AVX1-NEXT: .LBB13_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB13_7 +; AVX1-NEXT: .LBB13_8: # %else6 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB13_1: # %cond.store ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB13_2: # %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB13_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB13_3: # %cond.store1 ; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB13_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB13_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB13_5: # %cond.store3 ; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB13_6: # %else4 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB13_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB13_7: # %cond.store5 ; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB13_8: # %else6 ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v4i32_v4i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB13_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB13_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB13_3 +; AVX2-NEXT: .LBB13_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB13_5 +; AVX2-NEXT: .LBB13_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB13_7 +; AVX2-NEXT: .LBB13_8: # %else6 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB13_1: # %cond.store ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB13_2: # %else -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB13_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB13_3: # %cond.store1 ; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB13_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax 
-; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB13_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB13_5: # %cond.store3 ; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB13_6: # %else4 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB13_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB13_7: # %cond.store5 ; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB13_8: # %else6 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535] -; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB13_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB13_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB13_3 +; AVX512F-NEXT: .LBB13_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB13_5 +; AVX512F-NEXT: .LBB13_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB13_7 +; AVX512F-NEXT: .LBB13_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB13_1: # %cond.store ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB13_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB13_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB13_3: # %cond.store1 ; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB13_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB13_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB13_5: # %cond.store3 ; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB13_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB13_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB13_7: # %cond.store5 ; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB13_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5327,7 +4960,6 @@ ; SSE2-LABEL: truncstore_v4i32_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: pxor %xmm0, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] @@ -5335,192 +4967,186 @@ ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB14_2: # %else -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pcmpeqd 
%xmm1, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax +; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pextrw $2, %xmm2, %eax -; SSE2-NEXT: movb %al, 1(%rdi) +; SSE2-NEXT: jne .LBB14_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB14_3 ; SSE2-NEXT: .LBB14_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB14_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB14_5 ; SSE2-NEXT: .LBB14_6: # %else4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB14_7 +; SSE2-NEXT: .LBB14_8: # %else6 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB14_1: # %cond.store +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB14_4 +; SSE2-NEXT: .LBB14_3: # %cond.store1 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB14_6 +; SSE2-NEXT: .LBB14_5: # %cond.store3 +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB14_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 +; SSE2-NEXT: .LBB14_7: # %cond.store5 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: movb %al, 3(%rdi) -; SSE2-NEXT: .LBB14_8: # %else6 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v4i32_v4i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE4-NEXT: movmskps %xmm2, %eax +; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB14_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB14_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB14_3 +; SSE4-NEXT: .LBB14_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB14_5 +; SSE4-NEXT: .LBB14_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB14_7 +; SSE4-NEXT: .LBB14_8: # %else6 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB14_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB14_2: # %else -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB14_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB14_3: # %cond.store1 ; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB14_4: # %else2 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB14_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB14_5: # %cond.store3 ; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB14_6: # %else4 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB14_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB14_7: # %cond.store5 ; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB14_8: # %else6 ; 
SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v4i32_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB14_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB14_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB14_3 +; AVX1-NEXT: .LBB14_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB14_5 +; AVX1-NEXT: .LBB14_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB14_7 +; AVX1-NEXT: .LBB14_8: # %else6 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB14_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB14_2: # %else -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB14_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB14_3: # %cond.store1 ; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB14_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB14_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB14_5: # %cond.store3 ; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB14_6: # %else4 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB14_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB14_7: # %cond.store5 ; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB14_8: # %else6 ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v4i32_v4i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255] ; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB14_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB14_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB14_3 +; AVX2-NEXT: .LBB14_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB14_5 +; AVX2-NEXT: .LBB14_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB14_7 +; AVX2-NEXT: .LBB14_8: # %else6 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB14_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB14_2: # %else -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB14_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB14_3: # %cond.store1 ; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB14_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB14_6 -; AVX2-NEXT: # %bb.5: # 
%cond.store3 +; AVX2-NEXT: .LBB14_5: # %cond.store3 ; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB14_6: # %else4 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB14_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB14_7: # %cond.store5 ; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB14_8: # %else6 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB14_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB14_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB14_3 +; AVX512F-NEXT: .LBB14_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB14_5 +; AVX512F-NEXT: .LBB14_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB14_7 +; AVX512F-NEXT: .LBB14_8: # %else6 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB14_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB14_2: # %else -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB14_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB14_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB14_4: # %else2 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB14_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB14_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB14_6: # %else4 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB14_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB14_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB14_8: # %else6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5555,7 +5181,6 @@ ; SSE2-LABEL: truncstore_v32i16_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE2-NEXT: pxor %xmm6, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [33023,33023,33023,33023,33023,33023,33023,33023] @@ -5565,123 +5190,77 @@ ; SSE2-NEXT: pminsw %xmm8, %xmm0 ; SSE2-NEXT: pxor %xmm6, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm7, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al +; SSE2-NEXT: pcmpeqb %xmm7, %xmm4 +; SSE2-NEXT: pmovmskb %xmm4, %ecx +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm7, %xmm5 +; SSE2-NEXT: pmovmskb %xmm5, %eax +; SSE2-NEXT: notl %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: orl %ecx, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB15_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; 
SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB15_2: # %else -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB15_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB15_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB15_3 ; SSE2-NEXT: .LBB15_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB15_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB15_5 ; SSE2-NEXT: .LBB15_6: # %else4 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB15_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB15_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB15_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB15_10: # %else8 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB15_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB15_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB15_14: # %else12 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB15_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB15_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB15_18: # %else16 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB15_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB15_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB15_22: # %else20 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb 
%al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB15_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB15_24: # %else22 ; SSE2-NEXT: pxor %xmm6, %xmm3 ; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 @@ -5689,1393 +5268,1107 @@ ; SSE2-NEXT: .LBB15_26: # %else24 ; SSE2-NEXT: pminsw %xmm8, %xmm3 ; SSE2-NEXT: pminsw %xmm8, %xmm2 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB15_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB15_28: # %else26 ; SSE2-NEXT: pxor %xmm6, %xmm3 ; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE2-NEXT: pextrw $7, %xmm4, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx ; SSE2-NEXT: je .LBB15_30 ; SSE2-NEXT: # %bb.29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) ; SSE2-NEXT: .LBB15_30: # %else28 ; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB15_32 ; SSE2-NEXT: # %bb.31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) ; SSE2-NEXT: .LBB15_32: # %else30 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: je .LBB15_34 -; SSE2-NEXT: # %bb.33: # %cond.store31 -; SSE2-NEXT: movb %al, 16(%rdi) -; SSE2-NEXT: .LBB15_34: # %else32 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB15_36 -; SSE2-NEXT: # %bb.35: # %cond.store33 -; SSE2-NEXT: movb %ah, 17(%rdi) +; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: jne .LBB15_33 +; SSE2-NEXT: # %bb.34: # %else32 +; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE2-NEXT: jne .LBB15_35 ; SSE2-NEXT: .LBB15_36: # %else34 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB15_38 -; SSE2-NEXT: # %bb.37: # %cond.store35 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 18(%rdi) +; SSE2-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE2-NEXT: jne .LBB15_37 ; SSE2-NEXT: .LBB15_38: # %else36 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $524288, %eax # imm = 0x80000 ; SSE2-NEXT: je .LBB15_40 -; SSE2-NEXT: # %bb.39: # %cond.store37 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 19(%rdi) +; SSE2-NEXT: .LBB15_39: # %cond.store37 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 19(%rdi) ; SSE2-NEXT: .LBB15_40: # %else38 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl 
-; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: pextrw $2, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_42 ; SSE2-NEXT: # %bb.41: # %cond.store39 ; SSE2-NEXT: movb %cl, 20(%rdi) ; SSE2-NEXT: .LBB15_42: # %else40 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2097152, %eax # imm = 0x200000 ; SSE2-NEXT: je .LBB15_44 ; SSE2-NEXT: # %bb.43: # %cond.store41 ; SSE2-NEXT: movb %ch, 21(%rdi) ; SSE2-NEXT: .LBB15_44: # %else42 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: pextrw $3, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_46 ; SSE2-NEXT: # %bb.45: # %cond.store43 ; SSE2-NEXT: movb %cl, 22(%rdi) ; SSE2-NEXT: .LBB15_46: # %else44 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8388608, %eax # imm = 0x800000 ; SSE2-NEXT: je .LBB15_48 ; SSE2-NEXT: # %bb.47: # %cond.store45 ; SSE2-NEXT: movb %ch, 23(%rdi) ; SSE2-NEXT: .LBB15_48: # %else46 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: pextrw $4, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_50 ; SSE2-NEXT: # %bb.49: # %cond.store47 ; SSE2-NEXT: movb %cl, 24(%rdi) ; SSE2-NEXT: .LBB15_50: # %else48 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $33554432, %eax # imm = 0x2000000 ; SSE2-NEXT: je .LBB15_52 ; SSE2-NEXT: # %bb.51: # %cond.store49 ; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: .LBB15_52: # %else50 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: pextrw $5, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_54 ; SSE2-NEXT: # %bb.53: # %cond.store51 ; SSE2-NEXT: movb %cl, 26(%rdi) ; SSE2-NEXT: .LBB15_54: # %else52 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $134217728, %eax # imm = 0x8000000 ; SSE2-NEXT: je .LBB15_56 ; SSE2-NEXT: # %bb.55: # %cond.store53 ; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: .LBB15_56: # %else54 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: pextrw $6, %xmm2, %ecx ; SSE2-NEXT: je .LBB15_58 ; SSE2-NEXT: # %bb.57: # %cond.store55 ; SSE2-NEXT: movb %cl, 28(%rdi) ; SSE2-NEXT: .LBB15_58: # %else56 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $536870912, %eax # imm = 0x20000000 ; SSE2-NEXT: je .LBB15_60 ; SSE2-NEXT: # %bb.59: # %cond.store57 ; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: .LBB15_60: # %else58 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm5 -; SSE2-NEXT: pextrw $7, %xmm5, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: pextrw $7, %xmm2, %ecx -; SSE2-NEXT: je .LBB15_62 -; SSE2-NEXT: # %bb.61: # %cond.store59 +; SSE2-NEXT: jne 
.LBB15_61 +; SSE2-NEXT: # %bb.62: # %else60 +; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; SSE2-NEXT: jne .LBB15_63 +; SSE2-NEXT: .LBB15_64: # %else62 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB15_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB15_4 +; SSE2-NEXT: .LBB15_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB15_6 +; SSE2-NEXT: .LBB15_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB15_7 +; SSE2-NEXT: jmp .LBB15_8 +; SSE2-NEXT: .LBB15_33: # %cond.store31 +; SSE2-NEXT: movb %cl, 16(%rdi) +; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE2-NEXT: je .LBB15_36 +; SSE2-NEXT: .LBB15_35: # %cond.store33 +; SSE2-NEXT: movb %ch, 17(%rdi) +; SSE2-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE2-NEXT: je .LBB15_38 +; SSE2-NEXT: .LBB15_37: # %cond.store35 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 18(%rdi) +; SSE2-NEXT: testl $524288, %eax # imm = 0x80000 +; SSE2-NEXT: jne .LBB15_39 +; SSE2-NEXT: jmp .LBB15_40 +; SSE2-NEXT: .LBB15_61: # %cond.store59 ; SSE2-NEXT: movb %cl, 30(%rdi) -; SSE2-NEXT: .LBB15_62: # %else60 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; SSE2-NEXT: je .LBB15_64 -; SSE2-NEXT: # %bb.63: # %cond.store61 +; SSE2-NEXT: .LBB15_63: # %cond.store61 ; SSE2-NEXT: movb %ch, 31(%rdi) -; SSE2-NEXT: .LBB15_64: # %else62 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v32i16_v32i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm7 ; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; SSE4-NEXT: pminuw %xmm6, %xmm1 ; SSE4-NEXT: pminuw %xmm6, %xmm0 ; SSE4-NEXT: packuswb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm7, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: pcmpeqb %xmm7, %xmm4 +; SSE4-NEXT: pmovmskb %xmm4, %ecx +; SSE4-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; SSE4-NEXT: pcmpeqb %xmm7, %xmm5 +; SSE4-NEXT: pmovmskb %xmm5, %eax +; SSE4-NEXT: notl %eax +; SSE4-NEXT: shll $16, %eax +; SSE4-NEXT: orl %ecx, %eax +; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: jne .LBB15_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB15_3 +; SSE4-NEXT: .LBB15_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB15_5 +; SSE4-NEXT: .LBB15_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB15_7 +; SSE4-NEXT: .LBB15_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB15_9 +; SSE4-NEXT: .LBB15_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB15_11 +; SSE4-NEXT: .LBB15_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB15_13 +; SSE4-NEXT: .LBB15_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB15_15 +; SSE4-NEXT: .LBB15_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB15_17 +; SSE4-NEXT: .LBB15_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB15_19 +; SSE4-NEXT: .LBB15_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB15_21 +; SSE4-NEXT: .LBB15_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB15_23 +; SSE4-NEXT: .LBB15_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne 
.LBB15_25 +; SSE4-NEXT: .LBB15_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB15_27 +; SSE4-NEXT: .LBB15_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: je .LBB15_30 +; SSE4-NEXT: .LBB15_29: # %cond.store27 +; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) +; SSE4-NEXT: .LBB15_30: # %else28 +; SSE4-NEXT: pminuw %xmm6, %xmm3 +; SSE4-NEXT: pminuw %xmm6, %xmm2 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: je .LBB15_32 +; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) +; SSE4-NEXT: .LBB15_32: # %else30 +; SSE4-NEXT: packuswb %xmm3, %xmm2 +; SSE4-NEXT: testl $65536, %eax # imm = 0x10000 +; SSE4-NEXT: jne .LBB15_33 +; SSE4-NEXT: # %bb.34: # %else32 +; SSE4-NEXT: testl $131072, %eax # imm = 0x20000 +; SSE4-NEXT: jne .LBB15_35 +; SSE4-NEXT: .LBB15_36: # %else34 +; SSE4-NEXT: testl $262144, %eax # imm = 0x40000 +; SSE4-NEXT: jne .LBB15_37 +; SSE4-NEXT: .LBB15_38: # %else36 +; SSE4-NEXT: testl $524288, %eax # imm = 0x80000 +; SSE4-NEXT: jne .LBB15_39 +; SSE4-NEXT: .LBB15_40: # %else38 +; SSE4-NEXT: testl $1048576, %eax # imm = 0x100000 +; SSE4-NEXT: jne .LBB15_41 +; SSE4-NEXT: .LBB15_42: # %else40 +; SSE4-NEXT: testl $2097152, %eax # imm = 0x200000 +; SSE4-NEXT: jne .LBB15_43 +; SSE4-NEXT: .LBB15_44: # %else42 +; SSE4-NEXT: testl $4194304, %eax # imm = 0x400000 +; SSE4-NEXT: jne .LBB15_45 +; SSE4-NEXT: .LBB15_46: # %else44 +; SSE4-NEXT: testl $8388608, %eax # imm = 0x800000 +; SSE4-NEXT: jne .LBB15_47 +; SSE4-NEXT: .LBB15_48: # %else46 +; SSE4-NEXT: testl $16777216, %eax # imm = 0x1000000 +; SSE4-NEXT: jne .LBB15_49 +; SSE4-NEXT: .LBB15_50: # %else48 +; SSE4-NEXT: testl $33554432, %eax # imm = 0x2000000 +; SSE4-NEXT: jne .LBB15_51 +; SSE4-NEXT: .LBB15_52: # %else50 +; SSE4-NEXT: testl $67108864, %eax # imm = 0x4000000 +; SSE4-NEXT: jne .LBB15_53 +; SSE4-NEXT: .LBB15_54: # %else52 +; SSE4-NEXT: testl $134217728, %eax # imm = 0x8000000 +; SSE4-NEXT: jne .LBB15_55 +; SSE4-NEXT: .LBB15_56: # %else54 +; SSE4-NEXT: testl $268435456, %eax # imm = 0x10000000 +; SSE4-NEXT: jne .LBB15_57 +; SSE4-NEXT: .LBB15_58: # %else56 +; SSE4-NEXT: testl $536870912, %eax # imm = 0x20000000 +; SSE4-NEXT: jne .LBB15_59 +; SSE4-NEXT: .LBB15_60: # %else58 +; SSE4-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; SSE4-NEXT: jne .LBB15_61 +; SSE4-NEXT: .LBB15_62: # %else60 +; SSE4-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; SSE4-NEXT: jne .LBB15_63 +; SSE4-NEXT: .LBB15_64: # %else62 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB15_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB15_2: # %else -; SSE4-NEXT: pextrb $1, %xmm7, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB15_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB15_3: # %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB15_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $2, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB15_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB15_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB15_6: # %else4 -; SSE4-NEXT: pextrb $3, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB15_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB15_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: 
.LBB15_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB15_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB15_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB15_10: # %else8 -; SSE4-NEXT: pextrb $5, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB15_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB15_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB15_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $6, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB15_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB15_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB15_14: # %else12 -; SSE4-NEXT: pextrb $7, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB15_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB15_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB15_16: # %else14 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: je .LBB15_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB15_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB15_18: # %else16 -; SSE4-NEXT: pextrb $9, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB15_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB15_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB15_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $10, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB15_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB15_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB15_22: # %else20 -; SSE4-NEXT: pextrb $11, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB15_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB15_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB15_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB15_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB15_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB15_26: # %else24 -; SSE4-NEXT: pextrb $13, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB15_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB15_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB15_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE4-NEXT: pextrb $14, %xmm4, %eax -; 
SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 -; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB15_30: # %else28 -; SSE4-NEXT: pminuw %xmm6, %xmm3 -; SSE4-NEXT: pminuw %xmm6, %xmm2 -; SSE4-NEXT: pextrb $15, %xmm4, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 -; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB15_32: # %else30 -; SSE4-NEXT: packuswb %xmm3, %xmm2 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB15_34 -; SSE4-NEXT: # %bb.33: # %cond.store31 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB15_29 +; SSE4-NEXT: jmp .LBB15_30 +; SSE4-NEXT: .LBB15_33: # %cond.store31 ; SSE4-NEXT: pextrb $0, %xmm2, 16(%rdi) -; SSE4-NEXT: .LBB15_34: # %else32 -; SSE4-NEXT: pextrb $1, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $131072, %eax # imm = 0x20000 ; SSE4-NEXT: je .LBB15_36 -; SSE4-NEXT: # %bb.35: # %cond.store33 +; SSE4-NEXT: .LBB15_35: # %cond.store33 ; SSE4-NEXT: pextrb $1, %xmm2, 17(%rdi) -; SSE4-NEXT: .LBB15_36: # %else34 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $2, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $262144, %eax # imm = 0x40000 ; SSE4-NEXT: je .LBB15_38 -; SSE4-NEXT: # %bb.37: # %cond.store35 +; SSE4-NEXT: .LBB15_37: # %cond.store35 ; SSE4-NEXT: pextrb $2, %xmm2, 18(%rdi) -; SSE4-NEXT: .LBB15_38: # %else36 -; SSE4-NEXT: pextrb $3, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $524288, %eax # imm = 0x80000 ; SSE4-NEXT: je .LBB15_40 -; SSE4-NEXT: # %bb.39: # %cond.store37 +; SSE4-NEXT: .LBB15_39: # %cond.store37 ; SSE4-NEXT: pextrb $3, %xmm2, 19(%rdi) -; SSE4-NEXT: .LBB15_40: # %else38 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $4, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE4-NEXT: je .LBB15_42 -; SSE4-NEXT: # %bb.41: # %cond.store39 +; SSE4-NEXT: .LBB15_41: # %cond.store39 ; SSE4-NEXT: pextrb $4, %xmm2, 20(%rdi) -; SSE4-NEXT: .LBB15_42: # %else40 -; SSE4-NEXT: pextrb $5, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2097152, %eax # imm = 0x200000 ; SSE4-NEXT: je .LBB15_44 -; SSE4-NEXT: # %bb.43: # %cond.store41 +; SSE4-NEXT: .LBB15_43: # %cond.store41 ; SSE4-NEXT: pextrb $5, %xmm2, 21(%rdi) -; SSE4-NEXT: .LBB15_44: # %else42 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $6, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE4-NEXT: je .LBB15_46 -; SSE4-NEXT: # %bb.45: # %cond.store43 +; SSE4-NEXT: .LBB15_45: # %cond.store43 ; SSE4-NEXT: pextrb $6, %xmm2, 22(%rdi) -; SSE4-NEXT: .LBB15_46: # %else44 -; SSE4-NEXT: pextrb $7, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8388608, %eax # imm = 0x800000 ; SSE4-NEXT: je .LBB15_48 -; SSE4-NEXT: # %bb.47: # %cond.store45 +; SSE4-NEXT: .LBB15_47: # %cond.store45 ; SSE4-NEXT: pextrb $7, %xmm2, 23(%rdi) -; SSE4-NEXT: .LBB15_48: # %else46 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $8, %xmm0, %eax -; SSE4-NEXT: notb %al -; 
SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE4-NEXT: je .LBB15_50 -; SSE4-NEXT: # %bb.49: # %cond.store47 +; SSE4-NEXT: .LBB15_49: # %cond.store47 ; SSE4-NEXT: pextrb $8, %xmm2, 24(%rdi) -; SSE4-NEXT: .LBB15_50: # %else48 -; SSE4-NEXT: pextrb $9, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $33554432, %eax # imm = 0x2000000 ; SSE4-NEXT: je .LBB15_52 -; SSE4-NEXT: # %bb.51: # %cond.store49 +; SSE4-NEXT: .LBB15_51: # %cond.store49 ; SSE4-NEXT: pextrb $9, %xmm2, 25(%rdi) -; SSE4-NEXT: .LBB15_52: # %else50 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $10, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE4-NEXT: je .LBB15_54 -; SSE4-NEXT: # %bb.53: # %cond.store51 +; SSE4-NEXT: .LBB15_53: # %cond.store51 ; SSE4-NEXT: pextrb $10, %xmm2, 26(%rdi) -; SSE4-NEXT: .LBB15_54: # %else52 -; SSE4-NEXT: pextrb $11, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $134217728, %eax # imm = 0x8000000 ; SSE4-NEXT: je .LBB15_56 -; SSE4-NEXT: # %bb.55: # %cond.store53 +; SSE4-NEXT: .LBB15_55: # %cond.store53 ; SSE4-NEXT: pextrb $11, %xmm2, 27(%rdi) -; SSE4-NEXT: .LBB15_56: # %else54 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE4-NEXT: pextrb $12, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE4-NEXT: je .LBB15_58 -; SSE4-NEXT: # %bb.57: # %cond.store55 +; SSE4-NEXT: .LBB15_57: # %cond.store55 ; SSE4-NEXT: pextrb $12, %xmm2, 28(%rdi) -; SSE4-NEXT: .LBB15_58: # %else56 -; SSE4-NEXT: pextrb $13, %xmm0, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $536870912, %eax # imm = 0x20000000 ; SSE4-NEXT: je .LBB15_60 -; SSE4-NEXT: # %bb.59: # %cond.store57 +; SSE4-NEXT: .LBB15_59: # %cond.store57 ; SSE4-NEXT: pextrb $13, %xmm2, 29(%rdi) -; SSE4-NEXT: .LBB15_60: # %else58 -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pcmpeqb %xmm0, %xmm5 -; SSE4-NEXT: pextrb $14, %xmm5, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE4-NEXT: je .LBB15_62 -; SSE4-NEXT: # %bb.61: # %cond.store59 +; SSE4-NEXT: .LBB15_61: # %cond.store59 ; SSE4-NEXT: pextrb $14, %xmm2, 30(%rdi) -; SSE4-NEXT: .LBB15_62: # %else60 -; SSE4-NEXT: pextrb $15, %xmm5, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; SSE4-NEXT: je .LBB15_64 -; SSE4-NEXT: # %bb.63: # %cond.store61 +; SSE4-NEXT: .LBB15_63: # %cond.store61 ; SSE4-NEXT: pextrb $15, %xmm2, 31(%rdi) -; SSE4-NEXT: .LBB15_64: # %else62 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpminuw %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpminuw %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vpminuw %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpminuw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpminuw 
%xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpextrb $0, %xmm5, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_2 -; AVX1-NEXT: # %bb.1: # %cond.store -; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB15_2: # %else -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 -; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB15_4: # %else2 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $2, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 -; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB15_6: # %else4 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpmovmskb %xmm3, %ecx +; AVX1-NEXT: xorl $65535, %ecx # imm = 0xFFFF +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 -; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: shll $16, %eax +; AVX1-NEXT: orl %ecx, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB15_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB15_3 +; AVX1-NEXT: .LBB15_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB15_5 +; AVX1-NEXT: .LBB15_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB15_7 ; AVX1-NEXT: .LBB15_8: # %else6 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $4, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 -; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB15_9 ; AVX1-NEXT: .LBB15_10: # %else8 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $5, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 -; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB15_11 ; AVX1-NEXT: .LBB15_12: # %else10 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $6, %xmm4, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 -; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB15_13 ; AVX1-NEXT: .LBB15_14: # %else12 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 -; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX1-NEXT: 
testb $-128, %al +; AVX1-NEXT: jne .LBB15_15 ; AVX1-NEXT: .LBB15_16: # %else14 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $8, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB15_17 +; AVX1-NEXT: .LBB15_18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB15_19 +; AVX1-NEXT: .LBB15_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB15_21 +; AVX1-NEXT: .LBB15_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB15_23 +; AVX1-NEXT: .LBB15_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB15_25 +; AVX1-NEXT: .LBB15_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB15_27 +; AVX1-NEXT: .LBB15_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB15_29 +; AVX1-NEXT: .LBB15_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: je .LBB15_32 +; AVX1-NEXT: .LBB15_31: # %cond.store29 +; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX1-NEXT: .LBB15_32: # %else30 +; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: jne .LBB15_33 +; AVX1-NEXT: # %bb.34: # %else32 +; AVX1-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX1-NEXT: jne .LBB15_35 +; AVX1-NEXT: .LBB15_36: # %else34 +; AVX1-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX1-NEXT: jne .LBB15_37 +; AVX1-NEXT: .LBB15_38: # %else36 +; AVX1-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX1-NEXT: jne .LBB15_39 +; AVX1-NEXT: .LBB15_40: # %else38 +; AVX1-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX1-NEXT: jne .LBB15_41 +; AVX1-NEXT: .LBB15_42: # %else40 +; AVX1-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX1-NEXT: jne .LBB15_43 +; AVX1-NEXT: .LBB15_44: # %else42 +; AVX1-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX1-NEXT: jne .LBB15_45 +; AVX1-NEXT: .LBB15_46: # %else44 +; AVX1-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX1-NEXT: jne .LBB15_47 +; AVX1-NEXT: .LBB15_48: # %else46 +; AVX1-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX1-NEXT: jne .LBB15_49 +; AVX1-NEXT: .LBB15_50: # %else48 +; AVX1-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX1-NEXT: jne .LBB15_51 +; AVX1-NEXT: .LBB15_52: # %else50 +; AVX1-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX1-NEXT: jne .LBB15_53 +; AVX1-NEXT: .LBB15_54: # %else52 +; AVX1-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX1-NEXT: jne .LBB15_55 +; AVX1-NEXT: .LBB15_56: # %else54 +; AVX1-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX1-NEXT: jne .LBB15_57 +; AVX1-NEXT: .LBB15_58: # %else56 +; AVX1-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX1-NEXT: jne .LBB15_59 +; AVX1-NEXT: .LBB15_60: # %else58 +; AVX1-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX1-NEXT: jne .LBB15_61 +; AVX1-NEXT: .LBB15_62: # %else60 +; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX1-NEXT: jne .LBB15_63 +; AVX1-NEXT: .LBB15_64: # %else62 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB15_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB15_4 +; AVX1-NEXT: .LBB15_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je .LBB15_6 +; AVX1-NEXT: .LBB15_5: # %cond.store3 +; AVX1-NEXT: vpextrb $2, %xmm0, 
2(%rdi) +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je .LBB15_8 +; AVX1-NEXT: .LBB15_7: # %cond.store5 +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: je .LBB15_10 +; AVX1-NEXT: .LBB15_9: # %cond.store7 +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: je .LBB15_12 +; AVX1-NEXT: .LBB15_11: # %cond.store9 +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: je .LBB15_14 +; AVX1-NEXT: .LBB15_13: # %cond.store11 +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: je .LBB15_16 +; AVX1-NEXT: .LBB15_15: # %cond.store13 +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB15_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: .LBB15_17: # %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB15_18: # %else16 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB15_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB15_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB15_20: # %else18 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $10, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB15_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB15_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB15_22: # %else20 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB15_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB15_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB15_24: # %else22 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $12, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB15_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB15_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB15_26: # %else24 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $13, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB15_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB15_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB15_28: # %else26 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpextrb $14, %xmm4, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB15_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB15_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB15_30: # %else28 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb 
$15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB15_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 -; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB15_32: # %else30 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: je .LBB15_34 -; AVX1-NEXT: # %bb.33: # %cond.store31 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB15_31 +; AVX1-NEXT: jmp .LBB15_32 +; AVX1-NEXT: .LBB15_33: # %cond.store31 ; AVX1-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX1-NEXT: .LBB15_34: # %else32 -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $131072, %eax # imm = 0x20000 ; AVX1-NEXT: je .LBB15_36 -; AVX1-NEXT: # %bb.35: # %cond.store33 +; AVX1-NEXT: .LBB15_35: # %cond.store33 ; AVX1-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX1-NEXT: .LBB15_36: # %else34 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $262144, %eax # imm = 0x40000 ; AVX1-NEXT: je .LBB15_38 -; AVX1-NEXT: # %bb.37: # %cond.store35 +; AVX1-NEXT: .LBB15_37: # %cond.store35 ; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX1-NEXT: .LBB15_38: # %else36 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $524288, %eax # imm = 0x80000 ; AVX1-NEXT: je .LBB15_40 -; AVX1-NEXT: # %bb.39: # %cond.store37 +; AVX1-NEXT: .LBB15_39: # %cond.store37 ; AVX1-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX1-NEXT: .LBB15_40: # %else38 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1048576, %eax # imm = 0x100000 ; AVX1-NEXT: je .LBB15_42 -; AVX1-NEXT: # %bb.41: # %cond.store39 +; AVX1-NEXT: .LBB15_41: # %cond.store39 ; AVX1-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX1-NEXT: .LBB15_42: # %else40 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2097152, %eax # imm = 0x200000 ; AVX1-NEXT: je .LBB15_44 -; AVX1-NEXT: # %bb.43: # %cond.store41 +; AVX1-NEXT: .LBB15_43: # %cond.store41 ; AVX1-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX1-NEXT: .LBB15_44: # %else42 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4194304, %eax # imm = 0x400000 ; AVX1-NEXT: je .LBB15_46 -; AVX1-NEXT: # %bb.45: # %cond.store43 +; AVX1-NEXT: .LBB15_45: # %cond.store43 ; AVX1-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX1-NEXT: .LBB15_46: # %else44 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8388608, %eax # imm = 0x800000 ; AVX1-NEXT: je .LBB15_48 -; AVX1-NEXT: # %bb.47: # %cond.store45 +; AVX1-NEXT: .LBB15_47: # %cond.store45 ; AVX1-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX1-NEXT: .LBB15_48: # %else46 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, 
%xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16777216, %eax # imm = 0x1000000 ; AVX1-NEXT: je .LBB15_50 -; AVX1-NEXT: # %bb.49: # %cond.store47 +; AVX1-NEXT: .LBB15_49: # %cond.store47 ; AVX1-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX1-NEXT: .LBB15_50: # %else48 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $33554432, %eax # imm = 0x2000000 ; AVX1-NEXT: je .LBB15_52 -; AVX1-NEXT: # %bb.51: # %cond.store49 +; AVX1-NEXT: .LBB15_51: # %cond.store49 ; AVX1-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX1-NEXT: .LBB15_52: # %else50 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $67108864, %eax # imm = 0x4000000 ; AVX1-NEXT: je .LBB15_54 -; AVX1-NEXT: # %bb.53: # %cond.store51 +; AVX1-NEXT: .LBB15_53: # %cond.store51 ; AVX1-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX1-NEXT: .LBB15_54: # %else52 -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $134217728, %eax # imm = 0x8000000 ; AVX1-NEXT: je .LBB15_56 -; AVX1-NEXT: # %bb.55: # %cond.store53 +; AVX1-NEXT: .LBB15_55: # %cond.store53 ; AVX1-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX1-NEXT: .LBB15_56: # %else54 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $268435456, %eax # imm = 0x10000000 ; AVX1-NEXT: je .LBB15_58 -; AVX1-NEXT: # %bb.57: # %cond.store55 +; AVX1-NEXT: .LBB15_57: # %cond.store55 ; AVX1-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX1-NEXT: .LBB15_58: # %else56 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $536870912, %eax # imm = 0x20000000 ; AVX1-NEXT: je .LBB15_60 -; AVX1-NEXT: # %bb.59: # %cond.store57 +; AVX1-NEXT: .LBB15_59: # %cond.store57 ; AVX1-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX1-NEXT: .LBB15_60: # %else58 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; AVX1-NEXT: je .LBB15_62 -; AVX1-NEXT: # %bb.61: # %cond.store59 +; AVX1-NEXT: .LBB15_61: # %cond.store59 ; AVX1-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX1-NEXT: .LBB15_62: # %else60 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX1-NEXT: je .LBB15_64 -; AVX1-NEXT: # %bb.63: # %cond.store61 +; AVX1-NEXT: .LBB15_63: # %cond.store61 ; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX1-NEXT: .LBB15_64: # %else62 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v32i16_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminuw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpminuw %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,2,1,3] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $0, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB15_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpmovmskb %ymm1, %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB15_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB15_3 +; AVX2-NEXT: .LBB15_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB15_5 +; AVX2-NEXT: .LBB15_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB15_7 +; AVX2-NEXT: .LBB15_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB15_9 +; AVX2-NEXT: .LBB15_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB15_11 +; AVX2-NEXT: .LBB15_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB15_13 +; AVX2-NEXT: .LBB15_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB15_15 +; AVX2-NEXT: .LBB15_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB15_17 +; AVX2-NEXT: .LBB15_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB15_19 +; AVX2-NEXT: .LBB15_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB15_21 +; AVX2-NEXT: .LBB15_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB15_23 +; AVX2-NEXT: .LBB15_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB15_25 +; AVX2-NEXT: .LBB15_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB15_27 +; AVX2-NEXT: .LBB15_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB15_29 +; AVX2-NEXT: .LBB15_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: je .LBB15_32 +; AVX2-NEXT: .LBB15_31: # %cond.store29 +; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX2-NEXT: .LBB15_32: # %else30 +; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: jne .LBB15_33 +; AVX2-NEXT: # %bb.34: # %else32 +; AVX2-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX2-NEXT: jne .LBB15_35 +; AVX2-NEXT: .LBB15_36: # %else34 +; AVX2-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX2-NEXT: jne .LBB15_37 +; AVX2-NEXT: .LBB15_38: # %else36 +; AVX2-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX2-NEXT: jne .LBB15_39 +; AVX2-NEXT: .LBB15_40: # %else38 +; AVX2-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX2-NEXT: jne .LBB15_41 +; AVX2-NEXT: .LBB15_42: # %else40 +; AVX2-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX2-NEXT: jne .LBB15_43 +; AVX2-NEXT: .LBB15_44: # %else42 +; AVX2-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX2-NEXT: jne .LBB15_45 +; AVX2-NEXT: .LBB15_46: # %else44 +; AVX2-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX2-NEXT: jne .LBB15_47 +; AVX2-NEXT: .LBB15_48: # %else46 +; AVX2-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX2-NEXT: jne .LBB15_49 +; AVX2-NEXT: .LBB15_50: # %else48 +; AVX2-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX2-NEXT: jne .LBB15_51 +; AVX2-NEXT: .LBB15_52: # %else50 +; AVX2-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX2-NEXT: jne .LBB15_53 +; AVX2-NEXT: .LBB15_54: # %else52 +; AVX2-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX2-NEXT: jne .LBB15_55 +; AVX2-NEXT: .LBB15_56: # %else54 +; AVX2-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX2-NEXT: jne .LBB15_57 +; AVX2-NEXT: 
.LBB15_58: # %else56 +; AVX2-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX2-NEXT: jne .LBB15_59 +; AVX2-NEXT: .LBB15_60: # %else58 +; AVX2-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX2-NEXT: jne .LBB15_61 +; AVX2-NEXT: .LBB15_62: # %else60 +; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX2-NEXT: jne .LBB15_63 +; AVX2-NEXT: .LBB15_64: # %else62 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB15_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB15_2: # %else -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB15_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB15_3: # %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB15_4: # %else2 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $2, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB15_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB15_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB15_6: # %else4 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB15_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB15_7: # %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB15_8: # %else6 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $4, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB15_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB15_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB15_10: # %else8 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB15_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB15_11: # %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB15_12: # %else10 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $6, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB15_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB15_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB15_14: # %else12 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB15_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB15_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB15_16: # %else14 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $8, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB15_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB15_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB15_18: # %else16 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, 
%al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB15_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB15_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX2-NEXT: .LBB15_20: # %else18 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $10, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB15_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB15_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB15_22: # %else20 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB15_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB15_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB15_24: # %else22 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB15_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB15_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB15_26: # %else24 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB15_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB15_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB15_28: # %else26 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vpextrb $14, %xmm3, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB15_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB15_29: # %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB15_30: # %else28 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB15_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 -; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB15_32: # %else30 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: je .LBB15_34 -; AVX2-NEXT: # %bb.33: # %cond.store31 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB15_31 +; AVX2-NEXT: jmp .LBB15_32 +; AVX2-NEXT: .LBB15_33: # %cond.store31 ; AVX2-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX2-NEXT: .LBB15_34: # %else32 -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $131072, %eax # imm = 0x20000 ; AVX2-NEXT: je .LBB15_36 -; AVX2-NEXT: # %bb.35: # %cond.store33 +; AVX2-NEXT: .LBB15_35: # %cond.store33 ; AVX2-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX2-NEXT: .LBB15_36: # %else34 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $262144, %eax # imm = 0x40000 ; AVX2-NEXT: 
je .LBB15_38 -; AVX2-NEXT: # %bb.37: # %cond.store35 +; AVX2-NEXT: .LBB15_37: # %cond.store35 ; AVX2-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX2-NEXT: .LBB15_38: # %else36 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $524288, %eax # imm = 0x80000 ; AVX2-NEXT: je .LBB15_40 -; AVX2-NEXT: # %bb.39: # %cond.store37 +; AVX2-NEXT: .LBB15_39: # %cond.store37 ; AVX2-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX2-NEXT: .LBB15_40: # %else38 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1048576, %eax # imm = 0x100000 ; AVX2-NEXT: je .LBB15_42 -; AVX2-NEXT: # %bb.41: # %cond.store39 +; AVX2-NEXT: .LBB15_41: # %cond.store39 ; AVX2-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX2-NEXT: .LBB15_42: # %else40 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2097152, %eax # imm = 0x200000 ; AVX2-NEXT: je .LBB15_44 -; AVX2-NEXT: # %bb.43: # %cond.store41 +; AVX2-NEXT: .LBB15_43: # %cond.store41 ; AVX2-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX2-NEXT: .LBB15_44: # %else42 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4194304, %eax # imm = 0x400000 ; AVX2-NEXT: je .LBB15_46 -; AVX2-NEXT: # %bb.45: # %cond.store43 +; AVX2-NEXT: .LBB15_45: # %cond.store43 ; AVX2-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX2-NEXT: .LBB15_46: # %else44 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8388608, %eax # imm = 0x800000 ; AVX2-NEXT: je .LBB15_48 -; AVX2-NEXT: # %bb.47: # %cond.store45 +; AVX2-NEXT: .LBB15_47: # %cond.store45 ; AVX2-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX2-NEXT: .LBB15_48: # %else46 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; AVX2-NEXT: je .LBB15_50 -; AVX2-NEXT: # %bb.49: # %cond.store47 +; AVX2-NEXT: .LBB15_49: # %cond.store47 ; AVX2-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX2-NEXT: .LBB15_50: # %else48 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $33554432, %eax # imm = 0x2000000 ; AVX2-NEXT: je .LBB15_52 -; AVX2-NEXT: # %bb.51: # %cond.store49 +; AVX2-NEXT: .LBB15_51: # %cond.store49 ; AVX2-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX2-NEXT: .LBB15_52: # %else50 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; AVX2-NEXT: je .LBB15_54 -; AVX2-NEXT: # %bb.53: # %cond.store51 +; AVX2-NEXT: .LBB15_53: # %cond.store51 ; AVX2-NEXT: vpextrb $10, %xmm0, 26(%rdi) -; AVX2-NEXT: .LBB15_54: # %else52 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $134217728, %eax # imm = 0x8000000 ; AVX2-NEXT: je .LBB15_56 -; AVX2-NEXT: # %bb.55: # %cond.store53 +; AVX2-NEXT: .LBB15_55: # %cond.store53 ; AVX2-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX2-NEXT: .LBB15_56: # %else54 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; 
AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; AVX2-NEXT: je .LBB15_58 -; AVX2-NEXT: # %bb.57: # %cond.store55 +; AVX2-NEXT: .LBB15_57: # %cond.store55 ; AVX2-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX2-NEXT: .LBB15_58: # %else56 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $536870912, %eax # imm = 0x20000000 ; AVX2-NEXT: je .LBB15_60 -; AVX2-NEXT: # %bb.59: # %cond.store57 +; AVX2-NEXT: .LBB15_59: # %cond.store57 ; AVX2-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX2-NEXT: .LBB15_60: # %else58 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; AVX2-NEXT: je .LBB15_62 -; AVX2-NEXT: # %bb.61: # %cond.store59 +; AVX2-NEXT: .LBB15_61: # %cond.store59 ; AVX2-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX2-NEXT: .LBB15_62: # %else60 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX2-NEXT: je .LBB15_64 -; AVX2-NEXT: # %bb.63: # %cond.store61 +; AVX2-NEXT: .LBB15_63: # %cond.store61 ; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX2-NEXT: .LBB15_64: # %else62 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpternlogq $15, %zmm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4 -; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminuw %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpminuw %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpmovmskb %ymm2, %eax +; AVX512F-NEXT: notl %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB15_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB15_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB15_3 +; AVX512F-NEXT: .LBB15_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB15_5 +; AVX512F-NEXT: .LBB15_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB15_7 +; AVX512F-NEXT: .LBB15_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB15_9 +; AVX512F-NEXT: .LBB15_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB15_11 +; AVX512F-NEXT: .LBB15_12: # %else10 +; AVX512F-NEXT: testb $64, 
%al +; AVX512F-NEXT: jne .LBB15_13 +; AVX512F-NEXT: .LBB15_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB15_15 +; AVX512F-NEXT: .LBB15_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB15_17 +; AVX512F-NEXT: .LBB15_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB15_19 +; AVX512F-NEXT: .LBB15_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB15_21 +; AVX512F-NEXT: .LBB15_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB15_23 +; AVX512F-NEXT: .LBB15_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB15_25 +; AVX512F-NEXT: .LBB15_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB15_27 +; AVX512F-NEXT: .LBB15_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB15_29 +; AVX512F-NEXT: .LBB15_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: je .LBB15_32 +; AVX512F-NEXT: .LBB15_31: # %cond.store29 +; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512F-NEXT: .LBB15_32: # %else30 +; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: jne .LBB15_33 +; AVX512F-NEXT: # %bb.34: # %else32 +; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512F-NEXT: jne .LBB15_35 +; AVX512F-NEXT: .LBB15_36: # %else34 +; AVX512F-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512F-NEXT: jne .LBB15_37 +; AVX512F-NEXT: .LBB15_38: # %else36 +; AVX512F-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512F-NEXT: jne .LBB15_39 +; AVX512F-NEXT: .LBB15_40: # %else38 +; AVX512F-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512F-NEXT: jne .LBB15_41 +; AVX512F-NEXT: .LBB15_42: # %else40 +; AVX512F-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512F-NEXT: jne .LBB15_43 +; AVX512F-NEXT: .LBB15_44: # %else42 +; AVX512F-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512F-NEXT: jne .LBB15_45 +; AVX512F-NEXT: .LBB15_46: # %else44 +; AVX512F-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512F-NEXT: jne .LBB15_47 +; AVX512F-NEXT: .LBB15_48: # %else46 +; AVX512F-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX512F-NEXT: jne .LBB15_49 +; AVX512F-NEXT: .LBB15_50: # %else48 +; AVX512F-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512F-NEXT: jne .LBB15_51 +; AVX512F-NEXT: .LBB15_52: # %else50 +; AVX512F-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512F-NEXT: jne .LBB15_53 +; AVX512F-NEXT: .LBB15_54: # %else52 +; AVX512F-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512F-NEXT: jne .LBB15_55 +; AVX512F-NEXT: .LBB15_56: # %else54 +; AVX512F-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512F-NEXT: jne .LBB15_57 +; AVX512F-NEXT: .LBB15_58: # %else56 +; AVX512F-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512F-NEXT: jne .LBB15_59 +; AVX512F-NEXT: .LBB15_60: # %else58 +; AVX512F-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512F-NEXT: jne .LBB15_61 +; AVX512F-NEXT: .LBB15_62: # %else60 +; AVX512F-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512F-NEXT: jne .LBB15_63 +; AVX512F-NEXT: .LBB15_64: # %else62 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB15_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB15_2: # %else -; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: 
vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB15_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB15_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB15_4: # %else2 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB15_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB15_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB15_6: # %else4 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB15_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB15_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB15_8: # %else6 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB15_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB15_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB15_10: # %else8 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB15_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB15_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB15_12: # %else10 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB15_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB15_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB15_14: # %else12 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB15_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB15_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; 
AVX512F-NEXT: .LBB15_16: # %else14 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB15_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB15_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB15_18: # %else16 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB15_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB15_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB15_20: # %else18 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB15_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB15_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB15_22: # %else20 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB15_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB15_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB15_24: # %else22 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB15_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB15_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB15_26: # %else24 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB15_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB15_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB15_28: # %else26 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; 
AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB15_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB15_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB15_30: # %else28 -; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB15_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 -; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB15_32: # %else30 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: je .LBB15_34 -; AVX512F-NEXT: # %bb.33: # %cond.store31 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB15_31 +; AVX512F-NEXT: jmp .LBB15_32 +; AVX512F-NEXT: .LBB15_33: # %cond.store31 ; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi) -; AVX512F-NEXT: .LBB15_34: # %else32 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000 ; AVX512F-NEXT: je .LBB15_36 -; AVX512F-NEXT: # %bb.35: # %cond.store33 +; AVX512F-NEXT: .LBB15_35: # %cond.store33 ; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi) -; AVX512F-NEXT: .LBB15_36: # %else34 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $262144, %eax # imm = 0x40000 ; AVX512F-NEXT: je .LBB15_38 -; AVX512F-NEXT: # %bb.37: # %cond.store35 +; AVX512F-NEXT: .LBB15_37: # %cond.store35 ; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi) -; AVX512F-NEXT: .LBB15_38: # %else36 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $524288, %eax # imm = 0x80000 ; AVX512F-NEXT: je .LBB15_40 -; AVX512F-NEXT: # %bb.39: # %cond.store37 +; AVX512F-NEXT: .LBB15_39: # %cond.store37 ; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi) -; AVX512F-NEXT: .LBB15_40: # %else38 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; 
AVX512F-NEXT: testl $1048576, %eax # imm = 0x100000 ; AVX512F-NEXT: je .LBB15_42 -; AVX512F-NEXT: # %bb.41: # %cond.store39 +; AVX512F-NEXT: .LBB15_41: # %cond.store39 ; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi) -; AVX512F-NEXT: .LBB15_42: # %else40 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2097152, %eax # imm = 0x200000 ; AVX512F-NEXT: je .LBB15_44 -; AVX512F-NEXT: # %bb.43: # %cond.store41 +; AVX512F-NEXT: .LBB15_43: # %cond.store41 ; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi) -; AVX512F-NEXT: .LBB15_44: # %else42 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4194304, %eax # imm = 0x400000 ; AVX512F-NEXT: je .LBB15_46 -; AVX512F-NEXT: # %bb.45: # %cond.store43 +; AVX512F-NEXT: .LBB15_45: # %cond.store43 ; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi) -; AVX512F-NEXT: .LBB15_46: # %else44 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8388608, %eax # imm = 0x800000 ; AVX512F-NEXT: je .LBB15_48 -; AVX512F-NEXT: # %bb.47: # %cond.store45 +; AVX512F-NEXT: .LBB15_47: # %cond.store45 ; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi) -; AVX512F-NEXT: .LBB15_48: # %else46 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16777216, %eax # imm = 0x1000000 ; AVX512F-NEXT: je .LBB15_50 -; AVX512F-NEXT: # %bb.49: # %cond.store47 +; AVX512F-NEXT: .LBB15_49: # %cond.store47 ; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi) -; AVX512F-NEXT: .LBB15_50: # %else48 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $33554432, %eax # imm = 0x2000000 ; AVX512F-NEXT: je .LBB15_52 -; AVX512F-NEXT: # %bb.51: # %cond.store49 +; AVX512F-NEXT: .LBB15_51: # %cond.store49 ; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi) -; AVX512F-NEXT: .LBB15_52: # %else50 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $67108864, %eax # imm = 0x4000000 ; AVX512F-NEXT: je .LBB15_54 -; AVX512F-NEXT: # %bb.53: # %cond.store51 +; AVX512F-NEXT: .LBB15_53: # %cond.store51 ; AVX512F-NEXT: vpextrb $10, 
%xmm0, 26(%rdi) -; AVX512F-NEXT: .LBB15_54: # %else52 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $134217728, %eax # imm = 0x8000000 ; AVX512F-NEXT: je .LBB15_56 -; AVX512F-NEXT: # %bb.55: # %cond.store53 +; AVX512F-NEXT: .LBB15_55: # %cond.store53 ; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi) -; AVX512F-NEXT: .LBB15_56: # %else54 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $268435456, %eax # imm = 0x10000000 ; AVX512F-NEXT: je .LBB15_58 -; AVX512F-NEXT: # %bb.57: # %cond.store55 +; AVX512F-NEXT: .LBB15_57: # %cond.store55 ; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi) -; AVX512F-NEXT: .LBB15_58: # %else56 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $536870912, %eax # imm = 0x20000000 ; AVX512F-NEXT: je .LBB15_60 -; AVX512F-NEXT: # %bb.59: # %cond.store57 +; AVX512F-NEXT: .LBB15_59: # %cond.store57 ; AVX512F-NEXT: vpextrb $13, %xmm0, 29(%rdi) -; AVX512F-NEXT: .LBB15_60: # %else58 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; AVX512F-NEXT: je .LBB15_62 -; AVX512F-NEXT: # %bb.61: # %cond.store59 +; AVX512F-NEXT: .LBB15_61: # %cond.store59 ; AVX512F-NEXT: vpextrb $14, %xmm0, 30(%rdi) -; AVX512F-NEXT: .LBB15_62: # %else60 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX512F-NEXT: je .LBB15_64 -; AVX512F-NEXT: # %bb.63: # %cond.store61 +; AVX512F-NEXT: .LBB15_63: # %cond.store61 ; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi) -; AVX512F-NEXT: .LBB15_64: # %else62 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7107,7 +6400,6 @@ ; SSE2-LABEL: truncstore_v16i16_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023] @@ -7117,751 +6409,605 @@ ; SSE2-NEXT: pminsw %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: movl %ecx, %eax -; SSE2-NEXT: notb %al +; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE2-NEXT: pmovmskb %xmm3, %eax +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: testb $1, 
%al -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB16_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB16_2: # %else -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB16_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: movb %ah, 1(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB16_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB16_3 ; SSE2-NEXT: .LBB16_4: # %else2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movl %ecx, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: notb %dl -; SSE2-NEXT: testb $1, %dl -; SSE2-NEXT: je .LBB16_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: movl %eax, %edx -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB16_5 ; SSE2-NEXT: .LBB16_6: # %else4 -; SSE2-NEXT: shrl $24, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB16_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: shrl $24, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: .LBB16_7: # %cond.store5 +; SSE2-NEXT: shrl $24, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB16_8: # %else6 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) ; SSE2-NEXT: .LBB16_10: # %else8 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB16_12 ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB16_12: # %else10 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) ; SSE2-NEXT: .LBB16_14: # %else12 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB16_16 ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB16_16: # %else14 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $4, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) ; SSE2-NEXT: .LBB16_18: # %else16 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $512, %eax # imm = 0x200 ; SSE2-NEXT: je .LBB16_20 ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB16_20: # %else18 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: 
movb %cl, 10(%rdi) ; SSE2-NEXT: .LBB16_22: # %else20 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $2048, %eax # imm = 0x800 ; SSE2-NEXT: je .LBB16_24 ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB16_24: # %else22 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-NEXT: pextrw $6, %xmm1, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx ; SSE2-NEXT: je .LBB16_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB16_26: # %else24 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB16_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB16_28: # %else26 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm2 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: notb %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx -; SSE2-NEXT: je .LBB16_30 -; SSE2-NEXT: # %bb.29: # %cond.store27 +; SSE2-NEXT: jne .LBB16_29 +; SSE2-NEXT: # %bb.30: # %else28 +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE2-NEXT: jne .LBB16_31 +; SSE2-NEXT: .LBB16_32: # %else30 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB16_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB16_4 +; SSE2-NEXT: .LBB16_3: # %cond.store1 +; SSE2-NEXT: movb %ch, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB16_6 +; SSE2-NEXT: .LBB16_5: # %cond.store3 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB16_7 +; SSE2-NEXT: jmp .LBB16_8 +; SSE2-NEXT: .LBB16_29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) -; SSE2-NEXT: .LBB16_30: # %else28 -; SSE2-NEXT: shrl $8, %eax -; SSE2-NEXT: notb %al -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: je .LBB16_32 -; SSE2-NEXT: # %bb.31: # %cond.store29 +; SSE2-NEXT: .LBB16_31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) -; SSE2-NEXT: .LBB16_32: # %else30 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v16i16_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE4-NEXT: pminuw %xmm4, %xmm1 ; SSE4-NEXT: pminuw %xmm4, %xmm0 ; SSE4-NEXT: packuswb %xmm1, %xmm0 -; SSE4-NEXT: pextrb $0, %xmm3, %eax -; SSE4-NEXT: notb %al +; SSE4-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE4-NEXT: pmovmskb %xmm3, %eax +; SSE4-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB16_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB16_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB16_3 +; SSE4-NEXT: .LBB16_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB16_5 +; SSE4-NEXT: .LBB16_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB16_7 +; SSE4-NEXT: .LBB16_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB16_9 +; SSE4-NEXT: .LBB16_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB16_11 +; SSE4-NEXT: .LBB16_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB16_13 +; SSE4-NEXT: .LBB16_14: # %else12 +; SSE4-NEXT: 
testb $-128, %al +; SSE4-NEXT: jne .LBB16_15 +; SSE4-NEXT: .LBB16_16: # %else14 +; SSE4-NEXT: testl $256, %eax # imm = 0x100 +; SSE4-NEXT: jne .LBB16_17 +; SSE4-NEXT: .LBB16_18: # %else16 +; SSE4-NEXT: testl $512, %eax # imm = 0x200 +; SSE4-NEXT: jne .LBB16_19 +; SSE4-NEXT: .LBB16_20: # %else18 +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 +; SSE4-NEXT: jne .LBB16_21 +; SSE4-NEXT: .LBB16_22: # %else20 +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 +; SSE4-NEXT: jne .LBB16_23 +; SSE4-NEXT: .LBB16_24: # %else22 +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 +; SSE4-NEXT: jne .LBB16_25 +; SSE4-NEXT: .LBB16_26: # %else24 +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 +; SSE4-NEXT: jne .LBB16_27 +; SSE4-NEXT: .LBB16_28: # %else26 +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 +; SSE4-NEXT: jne .LBB16_29 +; SSE4-NEXT: .LBB16_30: # %else28 +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 +; SSE4-NEXT: jne .LBB16_31 +; SSE4-NEXT: .LBB16_32: # %else30 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB16_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB16_2: # %else -; SSE4-NEXT: pextrb $1, %xmm3, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB16_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB16_3: # %cond.store1 ; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB16_4: # %else2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $2, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB16_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB16_5: # %cond.store3 ; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB16_6: # %else4 -; SSE4-NEXT: pextrb $3, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB16_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB16_7: # %cond.store5 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB16_8: # %else6 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $4, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB16_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB16_9: # %cond.store7 ; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB16_10: # %else8 -; SSE4-NEXT: pextrb $5, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB16_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB16_11: # %cond.store9 ; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB16_12: # %else10 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $6, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB16_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB16_13: # %cond.store11 ; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB16_14: # %else12 -; SSE4-NEXT: pextrb $7, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB16_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB16_15: # %cond.store13 ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB16_16: # %else14 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $8, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $256, %eax # imm = 0x100 ; SSE4-NEXT: 
je .LBB16_18 -; SSE4-NEXT: # %bb.17: # %cond.store15 +; SSE4-NEXT: .LBB16_17: # %cond.store15 ; SSE4-NEXT: pextrb $8, %xmm0, 8(%rdi) -; SSE4-NEXT: .LBB16_18: # %else16 -; SSE4-NEXT: pextrb $9, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $512, %eax # imm = 0x200 ; SSE4-NEXT: je .LBB16_20 -; SSE4-NEXT: # %bb.19: # %cond.store17 +; SSE4-NEXT: .LBB16_19: # %cond.store17 ; SSE4-NEXT: pextrb $9, %xmm0, 9(%rdi) -; SSE4-NEXT: .LBB16_20: # %else18 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $10, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $1024, %eax # imm = 0x400 ; SSE4-NEXT: je .LBB16_22 -; SSE4-NEXT: # %bb.21: # %cond.store19 +; SSE4-NEXT: .LBB16_21: # %cond.store19 ; SSE4-NEXT: pextrb $10, %xmm0, 10(%rdi) -; SSE4-NEXT: .LBB16_22: # %else20 -; SSE4-NEXT: pextrb $11, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $2048, %eax # imm = 0x800 ; SSE4-NEXT: je .LBB16_24 -; SSE4-NEXT: # %bb.23: # %cond.store21 +; SSE4-NEXT: .LBB16_23: # %cond.store21 ; SSE4-NEXT: pextrb $11, %xmm0, 11(%rdi) -; SSE4-NEXT: .LBB16_24: # %else22 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE4-NEXT: pextrb $12, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE4-NEXT: je .LBB16_26 -; SSE4-NEXT: # %bb.25: # %cond.store23 +; SSE4-NEXT: .LBB16_25: # %cond.store23 ; SSE4-NEXT: pextrb $12, %xmm0, 12(%rdi) -; SSE4-NEXT: .LBB16_26: # %else24 -; SSE4-NEXT: pextrb $13, %xmm1, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE4-NEXT: je .LBB16_28 -; SSE4-NEXT: # %bb.27: # %cond.store25 +; SSE4-NEXT: .LBB16_27: # %cond.store25 ; SSE4-NEXT: pextrb $13, %xmm0, 13(%rdi) -; SSE4-NEXT: .LBB16_28: # %else26 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqb %xmm1, %xmm2 -; SSE4-NEXT: pextrb $14, %xmm2, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE4-NEXT: je .LBB16_30 -; SSE4-NEXT: # %bb.29: # %cond.store27 +; SSE4-NEXT: .LBB16_29: # %cond.store27 ; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi) -; SSE4-NEXT: .LBB16_30: # %else28 -; SSE4-NEXT: pextrb $15, %xmm2, %eax -; SSE4-NEXT: notb %al -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: je .LBB16_32 -; SSE4-NEXT: # %bb.31: # %cond.store29 +; SSE4-NEXT: .LBB16_31: # %cond.store29 ; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi) -; SSE4-NEXT: .LBB16_32: # %else30 ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: notb %al +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: je .LBB16_2 -; AVX1-NEXT: # %bb.1: # %cond.store +; AVX1-NEXT: jne .LBB16_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB16_3 +; AVX1-NEXT: .LBB16_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB16_5 +; AVX1-NEXT: .LBB16_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne 
.LBB16_7 +; AVX1-NEXT: .LBB16_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB16_9 +; AVX1-NEXT: .LBB16_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB16_11 +; AVX1-NEXT: .LBB16_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB16_13 +; AVX1-NEXT: .LBB16_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB16_15 +; AVX1-NEXT: .LBB16_16: # %else14 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 +; AVX1-NEXT: jne .LBB16_17 +; AVX1-NEXT: .LBB16_18: # %else16 +; AVX1-NEXT: testl $512, %eax # imm = 0x200 +; AVX1-NEXT: jne .LBB16_19 +; AVX1-NEXT: .LBB16_20: # %else18 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 +; AVX1-NEXT: jne .LBB16_21 +; AVX1-NEXT: .LBB16_22: # %else20 +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 +; AVX1-NEXT: jne .LBB16_23 +; AVX1-NEXT: .LBB16_24: # %else22 +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX1-NEXT: jne .LBB16_25 +; AVX1-NEXT: .LBB16_26: # %else24 +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX1-NEXT: jne .LBB16_27 +; AVX1-NEXT: .LBB16_28: # %else26 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX1-NEXT: jne .LBB16_29 +; AVX1-NEXT: .LBB16_30: # %else28 +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX1-NEXT: jne .LBB16_31 +; AVX1-NEXT: .LBB16_32: # %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB16_1: # %cond.store ; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: .LBB16_2: # %else -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB16_4 -; AVX1-NEXT: # %bb.3: # %cond.store1 +; AVX1-NEXT: .LBB16_3: # %cond.store1 ; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: .LBB16_4: # %else2 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB16_6 -; AVX1-NEXT: # %bb.5: # %cond.store3 +; AVX1-NEXT: .LBB16_5: # %cond.store3 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: .LBB16_6: # %else4 -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB16_8 -; AVX1-NEXT: # %bb.7: # %cond.store5 +; AVX1-NEXT: .LBB16_7: # %cond.store5 ; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: .LBB16_8: # %else6 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB16_10 -; AVX1-NEXT: # %bb.9: # %cond.store7 +; AVX1-NEXT: .LBB16_9: # %cond.store7 ; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX1-NEXT: .LBB16_10: # %else8 -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB16_12 -; AVX1-NEXT: # %bb.11: # %cond.store9 +; AVX1-NEXT: .LBB16_11: # %cond.store9 ; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX1-NEXT: .LBB16_12: # %else10 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB16_14 -; AVX1-NEXT: # %bb.13: # %cond.store11 +; AVX1-NEXT: .LBB16_13: # %cond.store11 ; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX1-NEXT: .LBB16_14: # %else12 -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; 
AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB16_16 -; AVX1-NEXT: # %bb.15: # %cond.store13 +; AVX1-NEXT: .LBB16_15: # %cond.store13 ; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX1-NEXT: .LBB16_16: # %else14 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: je .LBB16_18 -; AVX1-NEXT: # %bb.17: # %cond.store15 +; AVX1-NEXT: .LBB16_17: # %cond.store15 ; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX1-NEXT: .LBB16_18: # %else16 -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB16_20 -; AVX1-NEXT: # %bb.19: # %cond.store17 +; AVX1-NEXT: .LBB16_19: # %cond.store17 ; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX1-NEXT: .LBB16_20: # %else18 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB16_22 -; AVX1-NEXT: # %bb.21: # %cond.store19 +; AVX1-NEXT: .LBB16_21: # %cond.store19 ; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX1-NEXT: .LBB16_22: # %else20 -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $2048, %eax # imm = 0x800 ; AVX1-NEXT: je .LBB16_24 -; AVX1-NEXT: # %bb.23: # %cond.store21 +; AVX1-NEXT: .LBB16_23: # %cond.store21 ; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX1-NEXT: .LBB16_24: # %else22 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX1-NEXT: je .LBB16_26 -; AVX1-NEXT: # %bb.25: # %cond.store23 +; AVX1-NEXT: .LBB16_25: # %cond.store23 ; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX1-NEXT: .LBB16_26: # %else24 -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX1-NEXT: je .LBB16_28 -; AVX1-NEXT: # %bb.27: # %cond.store25 +; AVX1-NEXT: .LBB16_27: # %cond.store25 ; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX1-NEXT: .LBB16_28: # %else26 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: je .LBB16_30 -; AVX1-NEXT: # %bb.29: # %cond.store27 +; AVX1-NEXT: .LBB16_29: # %cond.store27 ; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX1-NEXT: .LBB16_30: # %else28 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX1-NEXT: je .LBB16_32 -; AVX1-NEXT: # %bb.31: # %cond.store29 +; AVX1-NEXT: .LBB16_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX1-NEXT: .LBB16_32: # %else30 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: truncstore_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: notb %al +; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, 
%eax +; AVX2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: je .LBB16_2 -; AVX2-NEXT: # %bb.1: # %cond.store +; AVX2-NEXT: jne .LBB16_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB16_3 +; AVX2-NEXT: .LBB16_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB16_5 +; AVX2-NEXT: .LBB16_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB16_7 +; AVX2-NEXT: .LBB16_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB16_9 +; AVX2-NEXT: .LBB16_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB16_11 +; AVX2-NEXT: .LBB16_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB16_13 +; AVX2-NEXT: .LBB16_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB16_15 +; AVX2-NEXT: .LBB16_16: # %else14 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 +; AVX2-NEXT: jne .LBB16_17 +; AVX2-NEXT: .LBB16_18: # %else16 +; AVX2-NEXT: testl $512, %eax # imm = 0x200 +; AVX2-NEXT: jne .LBB16_19 +; AVX2-NEXT: .LBB16_20: # %else18 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 +; AVX2-NEXT: jne .LBB16_21 +; AVX2-NEXT: .LBB16_22: # %else20 +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 +; AVX2-NEXT: jne .LBB16_23 +; AVX2-NEXT: .LBB16_24: # %else22 +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX2-NEXT: jne .LBB16_25 +; AVX2-NEXT: .LBB16_26: # %else24 +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX2-NEXT: jne .LBB16_27 +; AVX2-NEXT: .LBB16_28: # %else26 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX2-NEXT: jne .LBB16_29 +; AVX2-NEXT: .LBB16_30: # %else28 +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX2-NEXT: jne .LBB16_31 +; AVX2-NEXT: .LBB16_32: # %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB16_1: # %cond.store ; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: .LBB16_2: # %else -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB16_4 -; AVX2-NEXT: # %bb.3: # %cond.store1 +; AVX2-NEXT: .LBB16_3: # %cond.store1 ; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: .LBB16_4: # %else2 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB16_6 -; AVX2-NEXT: # %bb.5: # %cond.store3 +; AVX2-NEXT: .LBB16_5: # %cond.store3 ; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: .LBB16_6: # %else4 -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB16_8 -; AVX2-NEXT: # %bb.7: # %cond.store5 +; AVX2-NEXT: .LBB16_7: # %cond.store5 ; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: .LBB16_8: # %else6 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB16_10 -; AVX2-NEXT: # %bb.9: # %cond.store7 +; AVX2-NEXT: .LBB16_9: # %cond.store7 ; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX2-NEXT: .LBB16_10: # %else8 -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB16_12 -; AVX2-NEXT: # %bb.11: # %cond.store9 +; AVX2-NEXT: .LBB16_11: # %cond.store9 ; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX2-NEXT: .LBB16_12: # %else10 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: 
vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB16_14 -; AVX2-NEXT: # %bb.13: # %cond.store11 +; AVX2-NEXT: .LBB16_13: # %cond.store11 ; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX2-NEXT: .LBB16_14: # %else12 -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB16_16 -; AVX2-NEXT: # %bb.15: # %cond.store13 +; AVX2-NEXT: .LBB16_15: # %cond.store13 ; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX2-NEXT: .LBB16_16: # %else14 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: je .LBB16_18 -; AVX2-NEXT: # %bb.17: # %cond.store15 +; AVX2-NEXT: .LBB16_17: # %cond.store15 ; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX2-NEXT: .LBB16_18: # %else16 -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $512, %eax # imm = 0x200 ; AVX2-NEXT: je .LBB16_20 -; AVX2-NEXT: # %bb.19: # %cond.store17 +; AVX2-NEXT: .LBB16_19: # %cond.store17 ; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX2-NEXT: .LBB16_20: # %else18 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB16_22 -; AVX2-NEXT: # %bb.21: # %cond.store19 +; AVX2-NEXT: .LBB16_21: # %cond.store19 ; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX2-NEXT: .LBB16_22: # %else20 -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $2048, %eax # imm = 0x800 ; AVX2-NEXT: je .LBB16_24 -; AVX2-NEXT: # %bb.23: # %cond.store21 +; AVX2-NEXT: .LBB16_23: # %cond.store21 ; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX2-NEXT: .LBB16_24: # %else22 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX2-NEXT: je .LBB16_26 -; AVX2-NEXT: # %bb.25: # %cond.store23 +; AVX2-NEXT: .LBB16_25: # %cond.store23 ; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX2-NEXT: .LBB16_26: # %else24 -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX2-NEXT: je .LBB16_28 -; AVX2-NEXT: # %bb.27: # %cond.store25 +; AVX2-NEXT: .LBB16_27: # %cond.store25 ; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX2-NEXT: .LBB16_28: # %else26 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: je .LBB16_30 -; AVX2-NEXT: # %bb.29: # %cond.store27 +; AVX2-NEXT: .LBB16_29: # %cond.store27 ; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX2-NEXT: .LBB16_30: # %else28 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: je .LBB16_32 -; AVX2-NEXT: # %bb.31: # %cond.store29 +; AVX2-NEXT: .LBB16_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX2-NEXT: .LBB16_32: # %else30 ; AVX2-NEXT: vzeroupper ; 
AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v16i16_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpmovmskb %xmm1, %eax +; AVX512F-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB16_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB16_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB16_3 +; AVX512F-NEXT: .LBB16_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB16_5 +; AVX512F-NEXT: .LBB16_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB16_7 +; AVX512F-NEXT: .LBB16_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB16_9 +; AVX512F-NEXT: .LBB16_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB16_11 +; AVX512F-NEXT: .LBB16_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB16_13 +; AVX512F-NEXT: .LBB16_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB16_15 +; AVX512F-NEXT: .LBB16_16: # %else14 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 +; AVX512F-NEXT: jne .LBB16_17 +; AVX512F-NEXT: .LBB16_18: # %else16 +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 +; AVX512F-NEXT: jne .LBB16_19 +; AVX512F-NEXT: .LBB16_20: # %else18 +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512F-NEXT: jne .LBB16_21 +; AVX512F-NEXT: .LBB16_22: # %else20 +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512F-NEXT: jne .LBB16_23 +; AVX512F-NEXT: .LBB16_24: # %else22 +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512F-NEXT: jne .LBB16_25 +; AVX512F-NEXT: .LBB16_26: # %else24 +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512F-NEXT: jne .LBB16_27 +; AVX512F-NEXT: .LBB16_28: # %else26 +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512F-NEXT: jne .LBB16_29 +; AVX512F-NEXT: .LBB16_30: # %else28 +; AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512F-NEXT: jne .LBB16_31 +; AVX512F-NEXT: .LBB16_32: # %else30 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB16_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB16_2: # %else -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB16_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB16_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB16_4: # %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, 
%k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB16_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB16_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB16_6: # %else4 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB16_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB16_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB16_8: # %else6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB16_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB16_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB16_10: # %else8 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB16_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB16_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB16_12: # %else10 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB16_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB16_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB16_14: # %else12 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB16_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB16_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB16_16: # %else14 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: je .LBB16_18 -; AVX512F-NEXT: # %bb.17: # %cond.store15 +; AVX512F-NEXT: .LBB16_17: # %cond.store15 ; AVX512F-NEXT: vpextrb $8, %xmm0, 8(%rdi) -; AVX512F-NEXT: .LBB16_18: # %else16 -; AVX512F-NEXT: 
vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $9, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $512, %eax # imm = 0x200 ; AVX512F-NEXT: je .LBB16_20 -; AVX512F-NEXT: # %bb.19: # %cond.store17 +; AVX512F-NEXT: .LBB16_19: # %cond.store17 ; AVX512F-NEXT: vpextrb $9, %xmm0, 9(%rdi) -; AVX512F-NEXT: .LBB16_20: # %else18 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $10, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $1024, %eax # imm = 0x400 ; AVX512F-NEXT: je .LBB16_22 -; AVX512F-NEXT: # %bb.21: # %cond.store19 +; AVX512F-NEXT: .LBB16_21: # %cond.store19 ; AVX512F-NEXT: vpextrb $10, %xmm0, 10(%rdi) -; AVX512F-NEXT: .LBB16_22: # %else20 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $11, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $2048, %eax # imm = 0x800 ; AVX512F-NEXT: je .LBB16_24 -; AVX512F-NEXT: # %bb.23: # %cond.store21 +; AVX512F-NEXT: .LBB16_23: # %cond.store21 ; AVX512F-NEXT: vpextrb $11, %xmm0, 11(%rdi) -; AVX512F-NEXT: .LBB16_24: # %else22 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512F-NEXT: je .LBB16_26 -; AVX512F-NEXT: # %bb.25: # %cond.store23 +; AVX512F-NEXT: .LBB16_25: # %cond.store23 ; AVX512F-NEXT: vpextrb $12, %xmm0, 12(%rdi) -; AVX512F-NEXT: .LBB16_26: # %else24 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $13, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $8192, %eax # imm = 0x2000 ; AVX512F-NEXT: je .LBB16_28 -; AVX512F-NEXT: # %bb.27: # %cond.store25 +; AVX512F-NEXT: .LBB16_27: # %cond.store25 ; AVX512F-NEXT: vpextrb $13, %xmm0, 13(%rdi) -; AVX512F-NEXT: .LBB16_28: # %else26 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512F-NEXT: je .LBB16_30 -; AVX512F-NEXT: # %bb.29: # %cond.store27 +; AVX512F-NEXT: .LBB16_29: # %cond.store27 ; AVX512F-NEXT: vpextrb $14, %xmm0, 14(%rdi) -; AVX512F-NEXT: .LBB16_30: # %else28 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; 
AVX512F-NEXT: testl $32768, %eax # imm = 0x8000 ; AVX512F-NEXT: je .LBB16_32 -; AVX512F-NEXT: # %bb.31: # %cond.store29 +; AVX512F-NEXT: .LBB16_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) -; AVX512F-NEXT: .LBB16_32: # %else30 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7895,315 +7041,277 @@ ; SSE2-LABEL: truncstore_v8i16_v8i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: je .LBB17_2 -; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movb %al, (%rdi) -; SSE2-NEXT: .LBB17_2: # %else -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: je .LBB17_4 -; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 1(%rdi) -; SSE2-NEXT: .LBB17_4: # %else2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: packsswb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_6 -; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: jne .LBB17_1 +; SSE2-NEXT: # %bb.2: # %else +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: jne .LBB17_3 +; SSE2-NEXT: .LBB17_4: # %else2 +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: jne .LBB17_5 ; SSE2-NEXT: .LBB17_6: # %else4 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_8 -; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: movb %al, 3(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: jne .LBB17_7 ; SSE2-NEXT: .LBB17_8: # %else6 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_10 -; SSE2-NEXT: # %bb.9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: movb %al, 4(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: jne .LBB17_9 ; SSE2-NEXT: .LBB17_10: # %else8 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_12 -; SSE2-NEXT: # %bb.11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: movb %al, 5(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: jne .LBB17_11 ; SSE2-NEXT: .LBB17_12: # %else10 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pextrw $6, %xmm2, %eax -; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB17_14 -; SSE2-NEXT: # %bb.13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: movb %al, 6(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: jne .LBB17_13 ; SSE2-NEXT: .LBB17_14: # %else12 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: 
testb $-128, %al +; SSE2-NEXT: jne .LBB17_15 +; SSE2-NEXT: .LBB17_16: # %else14 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB17_1: # %cond.store +; SSE2-NEXT: movb %cl, (%rdi) +; SSE2-NEXT: testb $2, %al +; SSE2-NEXT: je .LBB17_4 +; SSE2-NEXT: .LBB17_3: # %cond.store1 +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 1(%rdi) +; SSE2-NEXT: testb $4, %al +; SSE2-NEXT: je .LBB17_6 +; SSE2-NEXT: .LBB17_5: # %cond.store3 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 2(%rdi) +; SSE2-NEXT: testb $8, %al +; SSE2-NEXT: je .LBB17_8 +; SSE2-NEXT: .LBB17_7: # %cond.store5 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 3(%rdi) +; SSE2-NEXT: testb $16, %al +; SSE2-NEXT: je .LBB17_10 +; SSE2-NEXT: .LBB17_9: # %cond.store7 +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 4(%rdi) +; SSE2-NEXT: testb $32, %al +; SSE2-NEXT: je .LBB17_12 +; SSE2-NEXT: .LBB17_11: # %cond.store9 +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 5(%rdi) +; SSE2-NEXT: testb $64, %al +; SSE2-NEXT: je .LBB17_14 +; SSE2-NEXT: .LBB17_13: # %cond.store11 +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: movb %cl, 6(%rdi) +; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB17_16 -; SSE2-NEXT: # %bb.15: # %cond.store13 +; SSE2-NEXT: .LBB17_15: # %cond.store13 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: movb %al, 7(%rdi) -; SSE2-NEXT: .LBB17_16: # %else14 ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i16_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 +; SSE4-NEXT: pxor %xmm2, %xmm2 ; SSE4-NEXT: pminuw {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pextrb $0, %xmm2, %eax +; SSE4-NEXT: pcmpeqw %xmm1, %xmm2 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm1 +; SSE4-NEXT: packsswb %xmm0, %xmm1 +; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: testb $1, %al -; SSE4-NEXT: je .LBB17_2 -; SSE4-NEXT: # %bb.1: # %cond.store +; SSE4-NEXT: jne .LBB17_1 +; SSE4-NEXT: # %bb.2: # %else +; SSE4-NEXT: testb $2, %al +; SSE4-NEXT: jne .LBB17_3 +; SSE4-NEXT: .LBB17_4: # %else2 +; SSE4-NEXT: testb $4, %al +; SSE4-NEXT: jne .LBB17_5 +; SSE4-NEXT: .LBB17_6: # %else4 +; SSE4-NEXT: testb $8, %al +; SSE4-NEXT: jne .LBB17_7 +; SSE4-NEXT: .LBB17_8: # %else6 +; SSE4-NEXT: testb $16, %al +; SSE4-NEXT: jne .LBB17_9 +; SSE4-NEXT: .LBB17_10: # %else8 +; SSE4-NEXT: testb $32, %al +; SSE4-NEXT: jne .LBB17_11 +; SSE4-NEXT: .LBB17_12: # %else10 +; SSE4-NEXT: testb $64, %al +; SSE4-NEXT: jne .LBB17_13 +; SSE4-NEXT: .LBB17_14: # %else12 +; SSE4-NEXT: testb $-128, %al +; SSE4-NEXT: jne .LBB17_15 +; SSE4-NEXT: .LBB17_16: # %else14 +; SSE4-NEXT: retq +; SSE4-NEXT: .LBB17_1: # %cond.store ; SSE4-NEXT: pextrb $0, %xmm0, (%rdi) -; SSE4-NEXT: .LBB17_2: # %else -; SSE4-NEXT: pextrb $2, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB17_4 -; SSE4-NEXT: # %bb.3: # %cond.store1 +; SSE4-NEXT: .LBB17_3: # %cond.store1 ; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi) -; SSE4-NEXT: .LBB17_4: # %else2 -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $4, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB17_6 -; SSE4-NEXT: # %bb.5: # %cond.store3 +; SSE4-NEXT: .LBB17_5: # %cond.store3 ; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi) -; SSE4-NEXT: .LBB17_6: # %else4 -; SSE4-NEXT: pextrb $6, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je 
.LBB17_8 -; SSE4-NEXT: # %bb.7: # %cond.store5 +; SSE4-NEXT: .LBB17_7: # %cond.store5 ; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi) -; SSE4-NEXT: .LBB17_8: # %else6 -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpeqw %xmm1, %xmm3 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm2 -; SSE4-NEXT: pextrb $8, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB17_10 -; SSE4-NEXT: # %bb.9: # %cond.store7 +; SSE4-NEXT: .LBB17_9: # %cond.store7 ; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi) -; SSE4-NEXT: .LBB17_10: # %else8 -; SSE4-NEXT: pextrb $10, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB17_12 -; SSE4-NEXT: # %bb.11: # %cond.store9 +; SSE4-NEXT: .LBB17_11: # %cond.store9 ; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi) -; SSE4-NEXT: .LBB17_12: # %else10 -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm2 -; SSE4-NEXT: pextrb $12, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB17_14 -; SSE4-NEXT: # %bb.13: # %cond.store11 +; SSE4-NEXT: .LBB17_13: # %cond.store11 ; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi) -; SSE4-NEXT: .LBB17_14: # %else12 -; SSE4-NEXT: pextrb $14, %xmm2, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB17_16 -; SSE4-NEXT: # %bb.15: # %cond.store13 +; SSE4-NEXT: .LBB17_15: # %cond.store13 ; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi) -; SSE4-NEXT: .LBB17_16: # %else14 ; SSE4-NEXT: retq ; ; AVX-LABEL: truncstore_v8i16_v8i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpmovmskb %xmm1, %eax ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB17_2 -; AVX-NEXT: # %bb.1: # %cond.store +; AVX-NEXT: jne .LBB17_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB17_3 +; AVX-NEXT: .LBB17_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB17_5 +; AVX-NEXT: .LBB17_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB17_7 +; AVX-NEXT: .LBB17_8: # %else6 +; AVX-NEXT: testb $16, %al +; AVX-NEXT: jne .LBB17_9 +; AVX-NEXT: .LBB17_10: # %else8 +; AVX-NEXT: testb $32, %al +; AVX-NEXT: jne .LBB17_11 +; AVX-NEXT: .LBB17_12: # %else10 +; AVX-NEXT: testb $64, %al +; AVX-NEXT: jne .LBB17_13 +; AVX-NEXT: .LBB17_14: # %else12 +; AVX-NEXT: testb $-128, %al +; AVX-NEXT: jne .LBB17_15 +; AVX-NEXT: .LBB17_16: # %else14 +; AVX-NEXT: retq +; AVX-NEXT: .LBB17_1: # %cond.store ; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: .LBB17_2: # %else -; AVX-NEXT: vpextrb $2, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $2, %al ; AVX-NEXT: je .LBB17_4 -; AVX-NEXT: # %bb.3: # %cond.store1 +; AVX-NEXT: .LBB17_3: # %cond.store1 ; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX-NEXT: .LBB17_4: # %else2 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $4, %al ; AVX-NEXT: je .LBB17_6 -; AVX-NEXT: # %bb.5: # %cond.store3 +; AVX-NEXT: .LBB17_5: # %cond.store3 ; AVX-NEXT: 
vpextrb $4, %xmm0, 2(%rdi) -; AVX-NEXT: .LBB17_6: # %else4 -; AVX-NEXT: vpextrb $6, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $8, %al ; AVX-NEXT: je .LBB17_8 -; AVX-NEXT: # %bb.7: # %cond.store5 +; AVX-NEXT: .LBB17_7: # %cond.store5 ; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX-NEXT: .LBB17_8: # %else6 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $8, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $16, %al ; AVX-NEXT: je .LBB17_10 -; AVX-NEXT: # %bb.9: # %cond.store7 +; AVX-NEXT: .LBB17_9: # %cond.store7 ; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX-NEXT: .LBB17_10: # %else8 -; AVX-NEXT: vpextrb $10, %xmm2, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $32, %al ; AVX-NEXT: je .LBB17_12 -; AVX-NEXT: # %bb.11: # %cond.store9 +; AVX-NEXT: .LBB17_11: # %cond.store9 ; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX-NEXT: .LBB17_12: # %else10 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $64, %al ; AVX-NEXT: je .LBB17_14 -; AVX-NEXT: # %bb.13: # %cond.store11 +; AVX-NEXT: .LBB17_13: # %cond.store11 ; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX-NEXT: .LBB17_14: # %else12 -; AVX-NEXT: vpextrb $14, %xmm1, %eax -; AVX-NEXT: testb $1, %al +; AVX-NEXT: testb $-128, %al ; AVX-NEXT: je .LBB17_16 -; AVX-NEXT: # %bb.15: # %cond.store13 +; AVX-NEXT: .LBB17_15: # %cond.store13 ; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX-NEXT: .LBB17_16: # %else14 ; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v8i16_v8i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB17_2 -; AVX512F-NEXT: # %bb.1: # %cond.store +; AVX512F-NEXT: jne .LBB17_1 +; AVX512F-NEXT: # %bb.2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: jne .LBB17_3 +; AVX512F-NEXT: .LBB17_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: jne .LBB17_5 +; AVX512F-NEXT: .LBB17_6: # %else4 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB17_7 +; AVX512F-NEXT: .LBB17_8: # %else6 +; AVX512F-NEXT: testb $16, %al +; AVX512F-NEXT: jne .LBB17_9 +; AVX512F-NEXT: .LBB17_10: # %else8 +; AVX512F-NEXT: testb $32, %al +; AVX512F-NEXT: jne .LBB17_11 +; AVX512F-NEXT: .LBB17_12: # %else10 +; AVX512F-NEXT: testb $64, %al +; AVX512F-NEXT: jne .LBB17_13 +; AVX512F-NEXT: .LBB17_14: # %else12 +; AVX512F-NEXT: testb $-128, %al +; AVX512F-NEXT: jne .LBB17_15 +; AVX512F-NEXT: .LBB17_16: # %else14 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB17_1: # %cond.store ; AVX512F-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX512F-NEXT: .LBB17_2: # %else -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k0 -; AVX512F-NEXT: 
kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $2, %al ; AVX512F-NEXT: je .LBB17_4 -; AVX512F-NEXT: # %bb.3: # %cond.store1 +; AVX512F-NEXT: .LBB17_3: # %cond.store1 ; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi) -; AVX512F-NEXT: .LBB17_4: # %else2 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: je .LBB17_6 -; AVX512F-NEXT: # %bb.5: # %cond.store3 +; AVX512F-NEXT: .LBB17_5: # %cond.store3 ; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi) -; AVX512F-NEXT: .LBB17_6: # %else4 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $3, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $8, %al ; AVX512F-NEXT: je .LBB17_8 -; AVX512F-NEXT: # %bb.7: # %cond.store5 +; AVX512F-NEXT: .LBB17_7: # %cond.store5 ; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi) -; AVX512F-NEXT: .LBB17_8: # %else6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm3 -; AVX512F-NEXT: vpmovsxwq %xmm3, %zmm3 -; AVX512F-NEXT: vptestmq %zmm3, %zmm3, %k0 -; AVX512F-NEXT: kshiftrw $4, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $16, %al ; AVX512F-NEXT: je .LBB17_10 -; AVX512F-NEXT: # %bb.9: # %cond.store7 +; AVX512F-NEXT: .LBB17_9: # %cond.store7 ; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi) -; AVX512F-NEXT: .LBB17_10: # %else8 -; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $5, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $32, %al ; AVX512F-NEXT: je .LBB17_12 -; AVX512F-NEXT: # %bb.11: # %cond.store9 +; AVX512F-NEXT: .LBB17_11: # %cond.store9 ; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi) -; AVX512F-NEXT: .LBB17_12: # %else10 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 -; AVX512F-NEXT: kshiftrw $6, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $64, %al ; AVX512F-NEXT: je .LBB17_14 -; AVX512F-NEXT: # %bb.13: # %cond.store11 +; AVX512F-NEXT: .LBB17_13: # %cond.store11 ; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi) -; AVX512F-NEXT: .LBB17_14: # %else12 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftrw $7, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $-128, %al ; AVX512F-NEXT: je .LBB17_16 -; AVX512F-NEXT: # %bb.15: # %cond.store13 +; AVX512F-NEXT: .LBB17_15: # %cond.store13 ; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi) -; AVX512F-NEXT: .LBB17_16: # %else14 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; Index: 
llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll =================================================================== --- llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll +++ llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-gather.ll @@ -3,8 +3,10 @@ define <2 x i64> @scalarize_v2i64(<2 x i64*> %p, <2 x i1> %mask, <2 x i64> %passthru) { ; CHECK-LABEL: @scalarize_v2i64( -; CHECK-NEXT: [[MASK0:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0 -; CHECK-NEXT: br i1 [[MASK0]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2 +; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]] ; CHECK: cond.load: ; CHECK-NEXT: [[PTR0:%.*]] = extractelement <2 x i64*> [[P:%.*]], i64 0 ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[PTR0]], align 8 @@ -12,8 +14,9 @@ ; CHECK-NEXT: br label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[RES0]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[MASK1:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 -; CHECK-NEXT: br i1 [[MASK1]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i2 [[SCALAR_MASK]], -2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i2 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] ; CHECK: cond.load1: ; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x i64*> [[P]], i64 1 ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[PTR1]], align 8 Index: llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll =================================================================== --- llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll +++ llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll @@ -4,24 +4,27 @@ define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %passthru) { ; CHECK-LABEL: @scalarize_v2i64( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64* -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2 +; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]] ; CHECK: cond.load: -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP5]], i64 0 ; CHECK-NEXT: br label [[ELSE]] ; CHECK: else: -; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 -; CHECK-NEXT: br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] +; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] +; CHECK-NEXT: [[TMP7:%.*]] = and i2 
[[SCALAR_MASK]], -2 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] ; CHECK: cond.load1: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP10]], i64 1 ; CHECK-NEXT: br label [[ELSE2]] ; CHECK: else2: -; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ] +; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ] ; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]] ; %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 128, <2 x i1> %mask, <2 x i64> %passthru) @@ -62,24 +65,27 @@ define <2 x i24> @scalarize_v2i24(<2 x i24>* %p, <2 x i1> %mask, <2 x i24> %passthru) { ; CHECK-LABEL: @scalarize_v2i24( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i24>* [[P:%.*]] to i24* -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2 +; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]] ; CHECK: cond.load: -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = load i24, i24* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i24, i24* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP5]], i64 0 ; CHECK-NEXT: br label [[ELSE]] ; CHECK: else: -; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 -; CHECK-NEXT: br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] +; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP6]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] +; CHECK-NEXT: [[TMP7:%.*]] = and i2 [[SCALAR_MASK]], -2 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i2 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] ; CHECK: cond.load1: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = load i24, i24* [[TMP7]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i24, i24* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i24, i24* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP10]], i64 1 ; CHECK-NEXT: br label [[ELSE2]] ; CHECK: else2: -; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ] +; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP11]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], 
Index: llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll
===================================================================
--- llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll
+++ llvm/trunk/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll
@@ -4,20 +4,23 @@
 define void @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %data) {
 ; CHECK-LABEL: @scalarize_v2i64(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
-; CHECK-NEXT: br i1 [[TMP2]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
+; CHECK-NEXT: [[TMP2:%.*]] = and i2 [[SCALAR_MASK]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i2 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
 ; CHECK: cond.store:
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
-; CHECK-NEXT: store i64 [[TMP3]], i64* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
+; CHECK-NEXT: store i64 [[TMP4]], i64* [[TMP5]], align 8
 ; CHECK-NEXT: br label [[ELSE]]
 ; CHECK: else:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
-; CHECK-NEXT: br i1 [[TMP5]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
 ; CHECK: cond.store1:
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
-; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP7]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
+; CHECK-NEXT: store i64 [[TMP8]], i64* [[TMP9]], align 8
 ; CHECK-NEXT: br label [[ELSE2]]
 ; CHECK: else2:
 ; CHECK-NEXT: ret void
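
The store expansion mirrors the load expansion but needs no phi nodes, since no value flows out of the conditional blocks. As with the gather sketch earlier, this is a hand-written rendering of the expanded form the test checks, with readable names (%bit0, %elt0, and so on) standing in for the pass's numbered temporaries:

define void @store_expanded(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %data) {
entry:
  ; One scalar pointer and one 2-bit scalar mask, computed once.
  %scalar_p = bitcast <2 x i64>* %p to i64*
  %scalar_mask = bitcast <2 x i1> %mask to i2
  ; Lane 0: test bit 0.
  %bit0 = and i2 %scalar_mask, 1
  %cond0 = icmp ne i2 %bit0, 0
  br i1 %cond0, label %cond.store, label %else

cond.store:
  %elt0 = extractelement <2 x i64> %data, i64 0
  %gep0 = getelementptr inbounds i64, i64* %scalar_p, i32 0
  store i64 %elt0, i64* %gep0, align 8
  br label %else

else:
  ; Lane 1: -2 is 0b10 in i2, so this tests bit 1.
  %bit1 = and i2 %scalar_mask, -2
  %cond1 = icmp ne i2 %bit1, 0
  br i1 %cond1, label %cond.store1, label %else2

cond.store1:
  %elt1 = extractelement <2 x i64> %data, i64 1
  %gep1 = getelementptr inbounds i64, i64* %scalar_p, i32 1
  store i64 %elt1, i64* %gep1, align 8
  br label %else2

else2:
  ret void
}
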