Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
@@ -2117,13 +2117,25 @@
     LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
   }
 
-  // TODO: Instead of comparing ULT, just subtract and return the difference?
-  Value *CmpNE = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+  if (Size < 4) {
+    // The i8 and i16 cases don't need compares. We zext the loaded values and
+    // subtract them to get the suitable negative, zero, or positive i32 result.
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
+    return Builder.CreateSub(LoadSrc1, LoadSrc2);
+  }
+
+  // The result of memcmp is negative, zero, or positive, so produce that by
+  // subtracting 2 extended compare bits: sub (ugt, ult).
+  // If a target prefers to use selects to get -1/0/1, they should be able
+  // to transform this later. The inverse transform (going from selects to math)
+  // may not be possible in the DAG because the selects got converted into
+  // branches before we got there.
+  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
   Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
-  Type *I32 = Builder.getInt32Ty();
-  Value *Sel1 = Builder.CreateSelect(CmpULT, ConstantInt::get(I32, -1),
-                                     ConstantInt::get(I32, 1));
-  return Builder.CreateSelect(CmpNE, Sel1, ConstantInt::get(I32, 0));
+  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+  return Builder.CreateSub(ZextUGT, ZextULT);
 }
 
 // This function expands the memcmp call into an inline expansion and returns
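Not part of the patch: a minimal C++ sketch of the arithmetic the rewritten expansion emits, assuming the loaded values have already been byte-swapped into memcmp (big-endian) order. The helper names are hypothetical; the comments map each line to the IRBuilder calls above.

#include <cassert>
#include <cstdint>

// Size < 4: both values fit in 16 bits, so the i32 difference cannot
// overflow and its sign is already the memcmp result.
static int expandSmall(uint16_t A, uint16_t B) {
  return (int32_t)(uint32_t)A - (int32_t)(uint32_t)B; // CreateZExt x2, CreateSub
}

// Size >= 4: subtract the two zero-extended compare bits to get -1, 0, or 1.
static int expandWide(uint64_t A, uint64_t B) {
  int UGT = A > B;  // CreateICmpUGT + CreateZExt
  int ULT = A < B;  // CreateICmpULT + CreateZExt
  return UGT - ULT; // CreateSub
}

int main() {
  assert(expandSmall(0x00FF, 0xFF00) < 0 && expandSmall(7, 7) == 0);
  assert(expandWide(2, 1) == 1 && expandWide(1, 2) == -1 && expandWide(3, 3) == 0);
}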
Index: llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
+++ llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
@@ -6,11 +6,13 @@
 ; CHECK: # BB#0:
 ; CHECK-NEXT: ldbrx 3, 0, 3
 ; CHECK-NEXT: ldbrx 4, 0, 4
-; CHECK-NEXT: li 5, 1
-; CHECK-NEXT: li 12, -1
+; CHECK-NEXT: li 5, 0
 ; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: isel 3, 12, 5, 0
-; CHECK-NEXT: isel 3, 0, 3, 2
+; CHECK-NEXT: li 3, 1
+; CHECK-NEXT: isel 4, 3, 5, 1
+; CHECK-NEXT: isel 3, 3, 5, 0
+; CHECK-NEXT: subf 3, 3, 4
+; CHECK-NEXT: extsw 3, 3
 ; CHECK-NEXT: blr
   %t0 = bitcast i32* %buffer1 to i8*
   %t1 = bitcast i32* %buffer2 to i8*
@@ -23,11 +25,12 @@
 ; CHECK: # BB#0:
 ; CHECK-NEXT: lwbrx 3, 0, 3
 ; CHECK-NEXT: lwbrx 4, 0, 4
-; CHECK-NEXT: li 5, 1
-; CHECK-NEXT: li 12, -1
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: isel 3, 12, 5, 0
-; CHECK-NEXT: isel 3, 0, 3, 2
+; CHECK-NEXT: sub 5, 4, 3
+; CHECK-NEXT: sub 3, 3, 4
+; CHECK-NEXT: rldicl 4, 5, 1, 63
+; CHECK-NEXT: rldicl 3, 3, 1, 63
+; CHECK-NEXT: subf 3, 3, 4
+; CHECK-NEXT: extsw 3, 3
 ; CHECK-NEXT: blr
   %t0 = bitcast i32* %buffer1 to i8*
   %t1 = bitcast i32* %buffer2 to i8*
@@ -40,11 +43,8 @@
 ; CHECK: # BB#0:
 ; CHECK-NEXT: lhbrx 3, 0, 3
 ; CHECK-NEXT: lhbrx 4, 0, 4
-; CHECK-NEXT: li 5, 1
-; CHECK-NEXT: li 12, -1
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: isel 3, 12, 5, 0
-; CHECK-NEXT: isel 3, 0, 3, 2
+; CHECK-NEXT: subf 3, 4, 3
+; CHECK-NEXT: extsw 3, 3
 ; CHECK-NEXT: blr
   %t0 = bitcast i32* %buffer1 to i8*
   %t1 = bitcast i32* %buffer2 to i8*
@@ -57,11 +57,8 @@
 ; CHECK: # BB#0:
 ; CHECK-NEXT: lbz 3, 0(3)
 ; CHECK-NEXT: lbz 4, 0(4)
-; CHECK-NEXT: li 5, 1
-; CHECK-NEXT: li 12, -1
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: isel 3, 12, 5, 0
-; CHECK-NEXT: isel 3, 0, 3, 2
+; CHECK-NEXT: subf 3, 4, 3
+; CHECK-NEXT: extsw 3, 3
 ; CHECK-NEXT: blr
   %t0 = bitcast i32* %buffer1 to i8*
   %t1 = bitcast i32* %buffer2 to i8*
Index: llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
+++ llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
@@ -59,20 +59,22 @@
   ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
   ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
   ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
-  ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ne i32 [[BSWAP1]], [[BSWAP2]]
+  ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]]
   ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[SELECT1:%[0-9]+]] = select i1 [[CMP2]], i32 -1, i32 1
-  ; CHECK-NEXT: [[SELECT2:%[0-9]+]] = select i1 [[CMP1]], i32 [[SELECT1]], i32 0
-  ; CHECK-NEXT: ret i32 [[SELECT2]]
+  ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
+  ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
+  ; CHECK-NEXT: ret i32 [[SUB]]
 
   ; CHECK-BE-LABEL: @test2(
   ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
   ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
-  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ne i32 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
   ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[SELECT1:%[0-9]+]] = select i1 [[CMP2]], i32 -1, i32 1
-  ; CHECK-BE-NEXT: [[SELECT2:%[0-9]+]] = select i1 [[CMP1]], i32 [[SELECT1]], i32 0
-  ; CHECK-BE-NEXT: ret i32 [[SELECT2]]
+  ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
+  ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
+  ; CHECK-BE-NEXT: ret i32 [[SUB]]
 
 entry:
   %0 = bitcast i32* %buffer1 to i8*
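An aside on the bswap calls checked above (CHECK has them, CHECK-BE does not): memcmp order is lexicographic byte order, which is exactly big-endian integer order, so only little-endian targets must byte-reverse before the unsigned compare. A sketch of the 4-byte case, assuming a little-endian host and the GCC/Clang builtin __builtin_bswap32:

#include <cassert>
#include <cstdint>
#include <cstring>

static int cmp4(const void *P1, const void *P2) {
  uint32_t A, B;
  std::memcpy(&A, P1, 4); // plain i32 load (PPC's lwbrx fuses load and swap)
  std::memcpy(&B, P2, 4);
  A = __builtin_bswap32(A); // llvm.bswap.i32; dropped on big-endian targets
  B = __builtin_bswap32(B);
  return (A > B) - (A < B);
}

int main() {
  unsigned char X[4] = {1, 0, 0, 0}; // lexicographically greater than Y,
  unsigned char Y[4] = {0, 0, 0, 2}; // though smaller as a little-endian i32
  assert(cmp4(X, Y) > 0 && std::memcmp(X, Y, 4) > 0);
}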
Index: llvm/trunk/test/CodeGen/X86/memcmp-optsize.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/memcmp-optsize.ll
+++ llvm/trunk/test/CodeGen/X86/memcmp-optsize.ll
@@ -14,24 +14,15 @@
 define i32 @length2(i8* %X, i8* %Y) nounwind optsize {
 ; X86-LABEL: length2:
 ; X86: # BB#0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movzwl (%ecx), %ecx
 ; X86-NEXT: movzwl (%eax), %edx
 ; X86-NEXT: rolw $8, %cx
 ; X86-NEXT: rolw $8, %dx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: incl %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: decl %eax
-; X86-NEXT: cmpw %dx, %cx
-; X86-NEXT: cmovael %edi, %eax
-; X86-NEXT: cmovel %esi, %eax
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
+; X86-NEXT: movzwl %cx, %eax
+; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length2:
@@ -40,12 +31,9 @@
 ; X64-NEXT: movzwl (%rsi), %ecx
 ; X64-NEXT: rolw $8, %ax
 ; X64-NEXT: rolw $8, %cx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
@@ -218,38 +206,28 @@
 define i32 @length4(i8* %X, i8* %Y) nounwind optsize {
 ; X86-LABEL: length4:
 ; X86: # BB#0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl (%ecx), %ecx
 ; X86-NEXT: movl (%eax), %edx
 ; X86-NEXT: bswapl %ecx
 ; X86-NEXT: bswapl %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: incl %edi
 ; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: decl %eax
 ; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: cmovael %edi, %eax
-; X86-NEXT: cmovel %esi, %eax
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
+; X86-NEXT: seta %al
+; X86-NEXT: sbbl $0, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length4:
 ; X64: # BB#0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl (%rsi), %ecx
-; X64-NEXT: bswapl %eax
+; X64-NEXT: movl (%rdi), %ecx
+; X64-NEXT: movl (%rsi), %edx
 ; X64-NEXT: bswapl %ecx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapl %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
 ; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   ret i32 %m
@@ -419,16 +397,14 @@
 ;
 ; X64-LABEL: length8:
 ; X64: # BB#0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
-; X64-NEXT: bswapq %rax
+; X64-NEXT: movq (%rdi), %rcx
+; X64-NEXT: movq (%rsi), %rdx
 ; X64-NEXT: bswapq %rcx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpq %rcx, %rax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapq %rdx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %rdx, %rcx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
 ; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   ret i32 %m
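The new x86 sequences rely on a flags idiom worth spelling out: after an unsigned cmp, CF is exactly the ult bit, so seta materializes the ugt bit and sbb $0 subtracts ult as a borrow. No cmov, no -1/+1 constants, and no scratch registers, which is why the push/pop pairs disappear in the X86 checks above. A C++ rendition (the function name is illustrative, not from the patch):

#include <cstdint>

int compareResult(uint32_t A, uint32_t B) {
  int R = 0;    // xorl %eax, %eax
  R = (A > B);  // cmpl %edx, %ecx ; seta %al
  R -= (A < B); // sbbl $0, %eax   (CF from the cmp is the ult bit)
  return R;     // -1, 0, or 1
}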
Index: llvm/trunk/test/CodeGen/X86/memcmp.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/memcmp.ll
+++ llvm/trunk/test/CodeGen/X86/memcmp.ll
@@ -17,15 +17,12 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: movzwl (%eax), %edx
 ; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %ax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpw %ax, %cx
-; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: cmovbl %ecx, %eax
-; X86-NEXT: cmovel %edx, %eax
+; X86-NEXT: rolw $8, %dx
+; X86-NEXT: movzwl %cx, %eax
+; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length2:
@@ -34,12 +31,9 @@
 ; X64-NEXT: movzwl (%rsi), %ecx
 ; X64-NEXT: rolw $8, %ax
 ; X64-NEXT: rolw $8, %cx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
@@ -211,29 +205,25 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl (%ecx), %ecx
-; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl (%eax), %edx
 ; X86-NEXT: bswapl %ecx
-; X86-NEXT: bswapl %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpl %eax, %ecx
-; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: cmovbl %ecx, %eax
-; X86-NEXT: cmovel %edx, %eax
+; X86-NEXT: bswapl %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl %edx, %ecx
+; X86-NEXT: seta %al
+; X86-NEXT: sbbl $0, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length4:
 ; X64: # BB#0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl (%rsi), %ecx
-; X64-NEXT: bswapl %eax
+; X64-NEXT: movl (%rdi), %ecx
+; X64-NEXT: movl (%rsi), %edx
 ; X64-NEXT: bswapl %ecx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapl %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
 ; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   ret i32 %m
@@ -399,16 +389,14 @@
 ;
 ; X64-LABEL: length8:
 ; X64: # BB#0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
-; X64-NEXT: bswapq %rax
+; X64-NEXT: movq (%rdi), %rcx
+; X64-NEXT: movq (%rsi), %rdx
 ; X64-NEXT: bswapq %rcx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpq %rcx, %rax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapq %rdx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %rdx, %rcx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
 ; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   ret i32 %m
Index: llvm/trunk/test/Transforms/CodeGenPrepare/X86/memcmp.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -12,11 +12,10 @@
 ; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
 ; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
 ; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; ALL-NEXT: [[TMP7:%.*]] = icmp ne i16 [[TMP5]], [[TMP6]]
-; ALL-NEXT: [[TMP8:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
-; ALL-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1
-; ALL-NEXT: [[TMP10:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0
-; ALL-NEXT: ret i32 [[TMP10]]
+; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT: ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
   ret i32 %call
@@ -93,11 +92,12 @@
 ; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
 ; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; ALL-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP5]], [[TMP6]]
+; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
 ; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; ALL-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1
-; ALL-NEXT: [[TMP10:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0
-; ALL-NEXT: ret i32 [[TMP10]]
+; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; ALL-NEXT: ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
   ret i32 %call
@@ -285,11 +285,12 @@
 ; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
 ; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP5]], [[TMP6]]
+; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
 ; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1
-; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0
-; X64-NEXT: ret i32 [[TMP10]]
+; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT: ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
   ret i32 %call
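A closing sanity check, not part of the test suite: the size < 4 fast path is sound because zero-extended i8/i16 values lie in [0, 65535], so the i32 subtraction can never overflow and its sign always matches the unsigned ordering. A standalone check covering every first operand (the second operand is strided to keep the loop short):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A <= 0xFFFF; ++A)
    for (uint32_t B = 0; B <= 0xFFFF; B += 251) { // prime stride
      int32_t D = (int32_t)A - (int32_t)B; // the zext+sub expansion
      assert((D < 0) == (A < B) && (D > 0) == (A > B));
    }
  return 0;
}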