This patch is one of a series of patches that provide builtins for compatibility
with the XL compiler. This patch adds the builtin and emits target-independent
code for __cmpb.
Backend test case (IR) that is expected to emit the single instruction cmpb (llvm/test/CodeGen/PowerPC/cmpb.ll):
; RUN: llc -verify-machineinstrs -mtriple powerpc64-unknown-linux-gnu -mcpu pwr7 < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu pwr7 -vec-extabi < %s | FileCheck %s
; test64: byte-wise equality of %x and %y, written out as generic IR.
; For each of the eight byte lanes, the result byte is 0xFF when the
; corresponding bytes of %x and %y are equal and 0x00 otherwise.
; The FileCheck assertions below expect the whole sequence to be selected
; as a single cmpb instruction followed by blr.
; NOTE(review): attribute group #0 is not defined in the visible part of
; this file - confirm it is declared elsewhere.
define i64 @test64(i64 %x, i64 %y) #0 {
; CHECK-LABEL: test64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb 3, 3, 4
; CHECK-NEXT: blr
entry:
; Lane 7 (bits 56-63): extract the top byte of each operand by shifting it
; down to the low bits; it is compared directly further below (%cmp88).
%shr19 = lshr i64 %x, 56
%conv21 = trunc i64 %shr19 to i32
%shr43 = lshr i64 %y, 56
%conv45 = trunc i64 %shr43 to i32
; %0 has an all-zero byte in every lane where %x and %y agree.
%0 = xor i64 %y, %x
; Lanes 0-6: isolate one byte of the xor per lane and test it for zero.
; Masks in hex, in order: 0xFF, 0xFF00, 0xFF0000, 0xFF000000,
; 0xFF00000000, 0xFF0000000000, 0xFF000000000000.
%1 = and i64 %0, 255
%cmp = icmp eq i64 %1, 0
%2 = and i64 %0, 65280
%cmp52 = icmp eq i64 %2, 0
%3 = and i64 %0, 16711680
%cmp58 = icmp eq i64 %3, 0
%4 = and i64 %0, 4278190080
%cmp64 = icmp eq i64 %4, 0
%5 = and i64 %0, 1095216660480
%cmp70 = icmp eq i64 %5, 0
%6 = and i64 %0, 280375465082880
%cmp76 = icmp eq i64 %6, 0
%7 = and i64 %0, 71776119061217280
%cmp82 = icmp eq i64 %7, 0
; Lane 7 equality, using the top bytes extracted at the start of the block.
%cmp88 = icmp eq i32 %conv21, %conv45
; Build the result: each select contributes its lane's 0xFF mask when that
; lane compared equal (0 otherwise), and the or-chain merges the lanes.
%conv92 = select i1 %cmp, i64 255, i64 0
%conv93 = select i1 %cmp52, i64 65280, i64 0
%or = or i64 %conv92, %conv93
%conv95 = select i1 %cmp58, i64 16711680, i64 0
%or97 = or i64 %or, %conv95
%conv98 = select i1 %cmp64, i64 4278190080, i64 0
%or100 = or i64 %or97, %conv98
%conv101 = select i1 %cmp70, i64 1095216660480, i64 0
%or103 = or i64 %or100, %conv101
%conv104 = select i1 %cmp76, i64 280375465082880, i64 0
%or106 = or i64 %or103, %conv104
%conv107 = select i1 %cmp82, i64 71776119061217280, i64 0
%or109 = or i64 %or106, %conv107
; -72057594037927936 is 0xFF00000000000000 as an unsigned 64-bit value,
; i.e. the mask for lane 7.
%conv110 = select i1 %cmp88, i64 -72057594037927936, i64 0
%or112 = or i64 %or109, %conv110
ret i64 %or112
}
I find it rather surprising that we are emitting this complex sequence for this builtin. Perhaps there is a good reason for doing so, but at the very least, this requires a thorough explanation in a comment.
One additional concern I have with this is that if some transformation proves that some portion of this sequence is unused (perhaps using DemandedBits analysis), it may optimize that portion out, so that the back end no longer recognizes the pattern and instead emits a whole bunch of xors, ors, rotates, etc.
For example:
It is entirely possible that the optimizer will get rid of some of the produced instructions and then the back end won't be able to emit a single cmpb but will have to emit a whole bunch of scalar instructions.