Index: lib/CodeGen/ExpandMemCmp.cpp =================================================================== --- lib/CodeGen/ExpandMemCmp.cpp +++ lib/CodeGen/ExpandMemCmp.cpp @@ -564,12 +564,8 @@ // This function expands the memcmp call into an inline expansion and returns // the memcmp result. Value *MemCmpExpansion::getMemCmpExpansion() { - // A memcmp with zero-comparison with only one block of load and compare does - // not need to set up any extra blocks. This case could be handled in the DAG, - // but since we have all of the machinery to flexibly expand any memcpy here, - // we choose to handle this case too to avoid fragmented lowering. - if ((!IsUsedForZeroCmp && NumLoadsPerBlockForZeroCmp != 1) || - getNumBlocks() != 1) { + // Create the basic block framework for a multi-block expansion. + if (getNumBlocks() != 1) { BasicBlock *StartBlock = CI->getParent(); EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); setupEndBlockPHINodes(); Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -829,6 +829,11 @@ /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. MVT hasFastEqualityCompare(unsigned NumBits) const override; + /// Allow multiple load pairs per block for smaller and faster code. + unsigned getMemcmpEqZeroLoadsPerBlock() const override { + return 2; + } + /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; Index: test/CodeGen/X86/memcmp-optsize.ll =================================================================== --- test/CodeGen/X86/memcmp-optsize.ll +++ test/CodeGen/X86/memcmp-optsize.ll @@ -160,35 +160,22 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: cmpw (%eax), %dx -; X86-NEXT: jne .LBB5_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movb 2(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 2(%eax), %dl -; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx -; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: xorw (%rsi), %ax ; X64-NEXT: movb 2(%rdi), %cl -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpb 2(%rsi), %cl -; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_2: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB5_3: # %endblock -; X64-NEXT: testl %eax, %eax +; X64-NEXT: xorb 2(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orw %ax, %cx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind @@ -318,35 +305,22 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB10_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movb 4(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 4(%eax), %dl -; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx -; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; 
X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: xorl (%rsi), %eax ; X64-NEXT: movb 4(%rdi), %cl -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpb 4(%rsi), %cl -; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_2: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB10_3: # %endblock -; X64-NEXT: testl %eax, %eax +; X64-NEXT: xorb 4(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orl %eax, %ecx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind @@ -404,18 +378,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB12_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%ecx), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl 4(%eax), %edx -; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx -; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -433,18 +399,12 @@ define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 -; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax -; X86-NEXT: .LBB13_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx ; X86-NEXT: setne %al ; X86-NEXT: retl ; @@ -475,17 +435,10 @@ ; X64-LABEL: length12_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl 8(%rsi), %ecx -; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_2: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB14_3: # %endblock -; X64-NEXT: testl %eax, %eax +; X64-NEXT: xorl 8(%rsi), %ecx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind @@ -703,37 +656,25 @@ ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pmovmskb %xmm2, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: movq 16(%rdi), %rcx -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx -; X64-SSE2-NEXT: je 
.LBB20_3 -; X64-SSE2-NEXT: .LBB20_2: # %res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB20_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X64-AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_2 -; X64-AVX2-NEXT: # %bb.1: # %loadbb1 -; X64-AVX2-NEXT: movq 16(%rdi), %rcx -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx -; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_2: # %res_block -; X64-AVX2-NEXT: movl $1, %eax -; X64-AVX2-NEXT: .LBB20_3: # %endblock -; X64-AVX2-NEXT: testl %eax, %eax ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind @@ -757,38 +698,28 @@ ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE2-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-SSE2-NEXT: movq %rax, %xmm2 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 -; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) -; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_2: # %res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB21_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX2-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-AVX2-NEXT: vmovq %rax, %xmm2 +; X64-AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_2 -; X64-AVX2-NEXT: # %bb.1: # %loadbb1 -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 -; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi) -; X64-AVX2-NEXT: je .LBB21_3 -; X64-AVX2-NEXT: .LBB21_2: # %res_block -; X64-AVX2-NEXT: movl $1, %eax -; X64-AVX2-NEXT: .LBB21_3: # %endblock -; X64-AVX2-NEXT: testl %eax, %eax ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind @@ -835,47 +766,28 @@ ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %edx -; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB23_2 -; X86-SSE2-NEXT: # %bb.1: # %loadbb1 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 -; X86-SSE2-NEXT: 
movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx -; X86-SSE2-NEXT: xorl %eax, %eax -; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: je .LBB23_3 -; X86-SSE2-NEXT: .LBB23_2: # %res_block -; X86-SSE2-NEXT: xorl %eax, %eax -; X86-SSE2-NEXT: incl %eax -; X86-SSE2-NEXT: .LBB23_3: # %endblock -; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm2 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_2: # %res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB23_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; @@ -910,43 +822,24 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx -; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB24_2 -; X86-SSE2-NEXT: # %bb.1: # %loadbb1 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx -; X86-SSE2-NEXT: xorl %eax, %eax -; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: je .LBB24_3 -; X86-SSE2-NEXT: .LBB24_2: # %res_block -; X86-SSE2-NEXT: xorl %eax, %eax -; X86-SSE2-NEXT: incl %eax -; X86-SSE2-NEXT: .LBB24_3: # %endblock -; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB24_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X64-SSE2-NEXT: je .LBB24_3 -; X64-SSE2-NEXT: .LBB24_2: # 
%res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB24_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; @@ -1009,21 +902,12 @@ ; X64-AVX2-LABEL: length64_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB26_2 -; X64-AVX2-NEXT: # %bb.1: # %loadbb1 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: cmpl $-1, %ecx -; X64-AVX2-NEXT: je .LBB26_3 -; X64-AVX2-NEXT: .LBB26_2: # %res_block -; X64-AVX2-NEXT: movl $1, %eax -; X64-AVX2-NEXT: .LBB26_3: # %endblock -; X64-AVX2-NEXT: testl %eax, %eax ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1059,21 +943,12 @@ ; X64-AVX2-LABEL: length64_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB27_2 -; X64-AVX2-NEXT: # %bb.1: # %loadbb1 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: cmpl $-1, %ecx -; X64-AVX2-NEXT: je .LBB27_3 -; X64-AVX2-NEXT: .LBB27_2: # %res_block -; X64-AVX2-NEXT: movl $1, %eax -; X64-AVX2-NEXT: .LBB27_3: # %endblock -; X64-AVX2-NEXT: testl %eax, %eax ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq Index: test/CodeGen/X86/memcmp.ll =================================================================== --- test/CodeGen/X86/memcmp.ll +++ test/CodeGen/X86/memcmp.ll @@ -191,34 +191,22 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: cmpw (%eax), %dx -; X86-NEXT: jne .LBB7_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movb 2(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 2(%eax), %dl -; X86-NEXT: je .LBB7_3 -; X86-NEXT: .LBB7_2: # %res_block -; X86-NEXT: movl $1, %ecx -; X86-NEXT: .LBB7_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB7_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: xorw (%rsi), %ax ; X64-NEXT: movb 2(%rdi), %cl -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpb 2(%rsi), %cl -; X64-NEXT: je .LBB7_3 -; X64-NEXT: .LBB7_2: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB7_3: # %endblock -; X64-NEXT: testl %eax, %eax +; X64-NEXT: xorb 2(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orw %ax, %cx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind @@ -348,34 +336,22 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl 
(%eax), %edx -; X86-NEXT: jne .LBB12_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movb 4(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 4(%eax), %dl -; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_2: # %res_block -; X86-NEXT: movl $1, %ecx -; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB12_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: xorl (%rsi), %eax ; X64-NEXT: movb 4(%rdi), %cl -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpb 4(%rsi), %cl -; X64-NEXT: je .LBB12_3 -; X64-NEXT: .LBB12_2: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB12_3: # %endblock -; X64-NEXT: testl %eax, %eax +; X64-NEXT: xorb 4(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orl %eax, %ecx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind @@ -433,17 +409,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB14_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%ecx), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl 4(%eax), %edx -; X86-NEXT: je .LBB14_3 -; X86-NEXT: .LBB14_2: # %res_block -; X86-NEXT: movl $1, %ecx -; X86-NEXT: .LBB14_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -461,17 +430,12 @@ define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB15_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 -; X86-NEXT: je .LBB15_3 -; X86-NEXT: .LBB15_2: # %res_block -; X86-NEXT: movl $1, %eax -; X86-NEXT: .LBB15_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx ; X86-NEXT: setne %al ; X86-NEXT: retl ; @@ -502,17 +466,10 @@ ; X64-LABEL: length12_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB16_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl 8(%rsi), %ecx -; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_2: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB16_3: # %endblock -; X64-NEXT: testl %eax, %eax +; X64-NEXT: xorl 8(%rsi), %ecx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind @@ -754,37 +711,25 @@ ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pmovmskb 
%xmm2, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB22_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: movq 16(%rdi), %rcx -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx -; X64-SSE2-NEXT: je .LBB22_3 -; X64-SSE2-NEXT: .LBB22_2: # %res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB22_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X64-AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB22_2 -; X64-AVX-NEXT: # %bb.1: # %loadbb1 -; X64-AVX-NEXT: movq 16(%rdi), %rcx -; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: cmpq 16(%rsi), %rcx -; X64-AVX-NEXT: je .LBB22_3 -; X64-AVX-NEXT: .LBB22_2: # %res_block -; X64-AVX-NEXT: movl $1, %eax -; X64-AVX-NEXT: .LBB22_3: # %endblock -; X64-AVX-NEXT: testl %eax, %eax ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind @@ -808,38 +753,28 @@ ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE2-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-SSE2-NEXT: movq %rax, %xmm2 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 -; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) -; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_2: # %res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB23_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-AVX-NEXT: vmovq %rax, %xmm2 +; X64-AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB23_2 -; X64-AVX-NEXT: # %bb.1: # %loadbb1 -; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 -; X64-AVX-NEXT: cmpq %rcx, 16(%rdi) -; X64-AVX-NEXT: je .LBB23_3 -; X64-AVX-NEXT: .LBB23_2: # %res_block -; X64-AVX-NEXT: movl $1, %eax -; X64-AVX-NEXT: .LBB23_3: # %endblock -; X64-AVX-NEXT: testl %eax, %eax ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind @@ -898,67 +833,40 @@ ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 
-; X86-SSE2-NEXT: pmovmskb %xmm1, %edx -; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB25_2 -; X86-SSE2-NEXT: # %bb.1: # %loadbb1 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx -; X86-SSE2-NEXT: xorl %eax, %eax -; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: je .LBB25_3 -; X86-SSE2-NEXT: .LBB25_2: # %res_block -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: .LBB25_3: # %endblock -; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm2 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB25_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X64-SSE2-NEXT: je .LBB25_3 -; X64-SSE2-NEXT: .LBB25_2: # %res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB25_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB25_2 -; X64-AVX1-NEXT: # %bb.1: # %loadbb1 -; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 -; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 -; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx -; X64-AVX1-NEXT: xorl %eax, %eax -; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X64-AVX1-NEXT: je .LBB25_3 -; X64-AVX1-NEXT: .LBB25_2: # %res_block -; X64-AVX1-NEXT: movl $1, %eax -; X64-AVX1-NEXT: .LBB25_3: # %endblock -; X64-AVX1-NEXT: testl %eax, %eax ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: retq ; @@ -1005,63 +913,36 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx -; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB26_2 -; X86-SSE2-NEXT: # %bb.1: # %loadbb1 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx -; X86-SSE2-NEXT: xorl %eax, %eax -; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: 
je .LBB26_3 -; X86-SSE2-NEXT: .LBB26_2: # %res_block -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: .LBB26_3: # %endblock -; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB26_2 -; X64-SSE2-NEXT: # %bb.1: # %loadbb1 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx -; X64-SSE2-NEXT: xorl %eax, %eax -; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X64-SSE2-NEXT: je .LBB26_3 -; X64-SSE2-NEXT: .LBB26_2: # %res_block -; X64-SSE2-NEXT: movl $1, %eax -; X64-SSE2-NEXT: .LBB26_3: # %endblock -; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB26_2 -; X64-AVX1-NEXT: # %bb.1: # %loadbb1 -; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 -; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx -; X64-AVX1-NEXT: xorl %eax, %eax -; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X64-AVX1-NEXT: je .LBB26_3 -; X64-AVX1-NEXT: .LBB26_2: # %res_block -; X64-AVX1-NEXT: movl $1, %eax -; X64-AVX1-NEXT: .LBB26_3: # %endblock -; X64-AVX1-NEXT: testl %eax, %eax ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: retq ; @@ -1134,21 +1015,12 @@ ; X64-AVX2-LABEL: length64_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB28_2 -; X64-AVX2-NEXT: # %bb.1: # %loadbb1 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: cmpl $-1, %ecx -; X64-AVX2-NEXT: je .LBB28_3 -; X64-AVX2-NEXT: .LBB28_2: # %res_block -; X64-AVX2-NEXT: movl $1, %eax -; X64-AVX2-NEXT: .LBB28_3: # %endblock -; X64-AVX2-NEXT: testl %eax, %eax ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1195,21 +1067,12 @@ ; X64-AVX2-LABEL: length64_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB29_2 -; X64-AVX2-NEXT: # %bb.1: # %loadbb1 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; 
X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: cmpl $-1, %ecx -; X64-AVX2-NEXT: je .LBB29_3 -; X64-AVX2-NEXT: .LBB29_2: # %res_block -; X64-AVX2-NEXT: movl $1, %eax -; X64-AVX2-NEXT: .LBB29_3: # %endblock -; X64-AVX2-NEXT: testl %eax, %eax ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq Index: test/Transforms/ExpandMemCmp/X86/memcmp.ll =================================================================== --- test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 +; RUN: opt -S -expandmemcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_1LD +; RUN: opt -S -expandmemcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_2LD declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) @@ -430,29 +431,69 @@ } define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq3( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X32-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i16 +; X32-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i16 +; X32-NEXT: [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; 
X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq3( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X64_1LD-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64_1LD-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64_1LD-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq3( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X64_2LD-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64_2LD-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64_2LD-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i16 +; X64_2LD-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i16 +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]] +; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) %cmp = icmp eq i32 %call, 0 @@ -479,29 +520,69 @@ } define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp 
eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq5( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i32 +; X32-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i32 +; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq5( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64_1LD-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64_1LD-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64_1LD-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq5( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64_2LD-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64_2LD-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64_2LD-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i32 +; X64_2LD-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i32 +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) %cmp = icmp eq i32 %call, 0 @@ -510,31 +591,75 @@ } define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq6( -; 
ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 -; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] -; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] -; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq6( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; X32-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 +; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]] +; X32-NEXT: [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; X32-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq6( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; 
X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq6( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 +; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 +; X64_2LD-NEXT: [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]] +; X64_2LD-NEXT: [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]] +; X64_2LD-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) %cmp = icmp eq i32 %call, 0 @@ -557,28 +682,22 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: br label [[LOADBB:%.*]] -; X32: res_block: -; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb: ; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* ; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* ; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] ; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* ; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* ; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 ; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 ; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] ; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] -; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; @@ -606,29 +725,49 @@ ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; -; X64-LABEL: @cmp_eq9( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], 
label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] +; X64_1LD-LABEL: @cmp_eq9( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64_1LD-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64_1LD-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64_1LD-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq9( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64_2LD-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64_2LD-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64_2LD-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i64 +; X64_2LD-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i64 +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] +; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) %cmp = icmp eq i32 %call, 0 @@ -643,31 +782,53 @@ ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; -; X64-LABEL: @cmp_eq10( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], 
label [[LOADBB1:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 -; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] -; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] -; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] +; X64_1LD-LABEL: @cmp_eq10( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 +; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq10( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 +; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i64 +; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i64 +; X64_2LD-NEXT: [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]] +; X64_2LD-NEXT: [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]] +; X64_2LD-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0 +; X64_2LD-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) %cmp = icmp eq i32 %call, 0 @@ -695,31 +856,53 @@ ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; -; X64-LABEL: @cmp_eq12( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; 
X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 -; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] -; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] +; X64_1LD-LABEL: @cmp_eq12( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 +; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq12( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 +; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = zext i32 [[TMP10]] to i64 +; X64_2LD-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 +; X64_2LD-NEXT: [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]] +; X64_2LD-NEXT: [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]] +; X64_2LD-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0 +; X64_2LD-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp 
eq i32 [[TMP17]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) %cmp = icmp eq i32 %call, 0
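The test updates above all encode the same shape: with getMemcmpEqZeroLoadsPerBlock() returning 2 (or -memcmp-num-loads-per-block=2), an equality-only memcmp is expanded with no res_block/endblock control flow. Both load pairs are issued, each pair is combined with xor (pcmpeqb for the vector-sized cases), the per-pair results are merged with or (pand for vectors), and a single compare against zero feeds the setcc. A minimal C++ sketch of that branchless pattern for memcmp(x, y, 8) == 0 on a 32-bit target (two i32 load pairs, matching the X32 cmp_eq8 checks; the helper name and the use of memcpy to model the unaligned loads are illustrative only, not part of the patch):

  #include <cstdint>
  #include <cstring>

  // Branchless memcmp-equality expansion, two load pairs per block.
  bool eq8(const char *x, const char *y) {
    uint32_t x0, y0, x1, y1;
    std::memcpy(&x0, x, 4);      // load pair 0
    std::memcpy(&y0, y, 4);
    std::memcpy(&x1, x + 4, 4);  // load pair 1
    std::memcpy(&y1, y + 4, 4);
    // xor each pair, or the partial results together, and do one
    // compare at the end -- the xorl/xorl/orl/sete sequence in the
    // updated X86 checks.
    return ((x0 ^ y0) | (x1 ^ y1)) == 0;
  }

With one load per block (-memcmp-num-loads-per-block=1, the X64_1LD prefix), the expansion instead keeps the loadbb/loadbb1 chain with an early-exit branch to res_block, which is exactly the removed assembly in the hunks above.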