This patch is one of a series of patches that provide builtins for compatibility
with the XL compiler. This patch adds the builtin and emits target-independent
code for __cmpb.
Backend test case (IR) that emits the single instruction cmpb (llvm/test/CodeGen/PowerPC/cmpb.ll):
; RUN: llc -verify-machineinstrs -mtriple powerpc64-unknown-linux-gnu -mcpu pwr7 < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu pwr7 -vec-extabi < %s | FileCheck %s
; test64: byte-wise equality of two i64 values. For each of the eight bytes of
; %x and %y, the result holds 0xFF in that byte position when the two bytes are
; equal and 0x00 when they differ. The FileCheck directives below expect llc to
; select the entire body as one cmpb followed by blr.
; NOTE(review): attribute group #0 is not defined in the visible part of this
; file -- confirm it is declared elsewhere.
define i64 @test64(i64 %x, i64 %y) #0 {
; CHECK-LABEL: test64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpb 3, 3, 4
; CHECK-NEXT:    blr
entry:
  ; Byte 7 (bits 56-63) of each operand, moved down to the low bits; these two
  ; values are compared directly by %cmp88 further down.
  %shr19 = lshr i64 %x, 56
  %conv21 = trunc i64 %shr19 to i32
  %shr43 = lshr i64 %y, 56
  %conv45 = trunc i64 %shr43 to i32
  ; %0 has an all-zero byte wherever %x and %y agree in that byte.
  %0 = xor i64 %y, %x
  ; Bytes 0-6: isolate one byte of the xor at a time and test it for zero.
  ; Mask 255 = 0x00000000000000FF (byte 0).
  %1 = and i64 %0, 255
  %cmp = icmp eq i64 %1, 0
  ; Mask 65280 = 0x000000000000FF00 (byte 1).
  %2 = and i64 %0, 65280
  %cmp52 = icmp eq i64 %2, 0
  ; Mask 16711680 = 0x0000000000FF0000 (byte 2).
  %3 = and i64 %0, 16711680
  %cmp58 = icmp eq i64 %3, 0
  ; Mask 4278190080 = 0x00000000FF000000 (byte 3).
  %4 = and i64 %0, 4278190080
  %cmp64 = icmp eq i64 %4, 0
  ; Mask 1095216660480 = 0x000000FF00000000 (byte 4).
  %5 = and i64 %0, 1095216660480
  %cmp70 = icmp eq i64 %5, 0
  ; Mask 280375465082880 = 0x0000FF0000000000 (byte 5).
  %6 = and i64 %0, 280375465082880
  %cmp76 = icmp eq i64 %6, 0
  ; Mask 71776119061217280 = 0x00FF000000000000 (byte 6).
  %7 = and i64 %0, 71776119061217280
  %cmp82 = icmp eq i64 %7, 0
  ; Byte 7 is tested via the shifted/truncated values instead of a mask of %0.
  %cmp88 = icmp eq i32 %conv21, %conv45
  ; Turn each per-byte i1 into an all-ones mask for its byte position (or zero)
  ; and OR the eight masks together to form the result.
  %conv92 = select i1 %cmp, i64 255, i64 0
  %conv93 = select i1 %cmp52, i64 65280, i64 0
  %or = or i64 %conv92, %conv93
  %conv95 = select i1 %cmp58, i64 16711680, i64 0
  %or97 = or i64 %or, %conv95
  %conv98 = select i1 %cmp64, i64 4278190080, i64 0
  %or100 = or i64 %or97, %conv98
  %conv101 = select i1 %cmp70, i64 1095216660480, i64 0
  %or103 = or i64 %or100, %conv101
  %conv104 = select i1 %cmp76, i64 280375465082880, i64 0
  %or106 = or i64 %or103, %conv104
  %conv107 = select i1 %cmp82, i64 71776119061217280, i64 0
  %or109 = or i64 %or106, %conv107
  ; -72057594037927936 is 0xFF00000000000000 written as a signed i64 (byte 7).
  %conv110 = select i1 %cmp88, i64 -72057594037927936, i64 0
  %or112 = or i64 %or109, %conv110
  ret i64 %or112
}
I find it rather surprising that we are emitting this complex sequence for this builtin. Perhaps there is a good reason for doing so, but at the very least, this requires a thorough explanation in a comment.
One additional concern I have with this is that if some transformation proves that a portion of this sequence is unused (perhaps using DemandedBits analysis), it may optimize that portion away, so that the remaining code no longer matches the pattern and is emitted as a whole bunch of xors, ors, rotates, etc.
For example:
It is entirely possible that the optimizer will get rid of some of the produced instructions and then the back end won't be able to emit a single cmpb but will have to emit a whole bunch of scalar instructions.