Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2568,6 +2568,40 @@
   return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
 }
 
+// Optimized version of 8-bit BITREVERSE.
+static SDValue ExpandBITREVERSE8(SelectionDAG &DAG, SDValue Op,
+                                 const SDLoc &dl) {
+  EVT VT = Op.getValueType();
+  unsigned Sz = VT.getScalarSizeInBits();
+
+  assert(Sz == 8 && "Expected 8-bit size");
+
+  SDValue AL, CL, BL;
+  APInt Hex1H(Sz, 0x1), Hex2H(Sz, 0x2), Hex4H(Sz, 0x4);
+  APInt Hex55H(Sz, 0x55), Hex66H(Sz, 0x66);
+
+  SDValue Constant1H(DAG.getConstant(Hex1H, dl, VT));
+  SDValue Constant2H(DAG.getConstant(Hex2H, dl, VT));
+  SDValue Constant4H(DAG.getConstant(Hex4H, dl, VT));
+  SDValue Constant55H(DAG.getConstant(Hex55H, dl, VT));
+  SDValue Constant66H(DAG.getConstant(Hex66H, dl, VT));
+
+  BL = Op;
+  AL = DAG.getBitcast(VT, BL);                          // MOV AL,BL
+  BL = DAG.getNode(ISD::AND, dl, VT, BL, Constant55H);  // AND BL,55H
+  AL = DAG.getNode(ISD::XOR, dl, VT, AL, BL);           // XOR AL,BL
+  BL = DAG.getNode(ISD::ROTL, dl, VT, BL, Constant2H);  // ROL BL,2
+  AL = DAG.getNode(ISD::OR, dl, VT, BL, AL);            // OR AL,BL
+  CL = DAG.getBitcast(VT, AL);                          // MOV CL,AL
+  AL = DAG.getNode(ISD::AND, dl, VT, AL, Constant66H);  // AND AL,66H
+  CL = DAG.getNode(ISD::XOR, dl, VT, AL, CL);           // XOR CL,AL
+  AL = DAG.getNode(ISD::ROTL, dl, VT, AL, Constant4H);  // ROL AL,4
+  AL = DAG.getNode(ISD::OR, dl, VT, AL, CL);            // OR AL,CL
+  AL = DAG.getNode(ISD::ROTL, dl, VT, AL, Constant1H);  // ROL AL,1
+
+  return AL;
+}
+
 /// Legalize a BITREVERSE scalar/vector operation as a series of mask + shifts.
 SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
   EVT VT = Op.getValueType();
@@ -2576,6 +2610,9 @@
 
   SDValue Tmp, Tmp2, Tmp3;
 
+  if (Sz == 8 && VT.isScalarInteger()) {
+    return ExpandBITREVERSE8(DAG, Op, dl);
+  }
   // If we can, perform BSWAP first and then the mask+swap the i4, then i2
   // and finally the i1 pairs.
   // TODO: We can easily support i4/i2 legal types if any target ever does.
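For reviewers, the node sequence above can be sanity-checked in isolation. Below is a minimal standalone C++ sketch, not part of the patch: it mirrors the same AND/XOR/OR/ROL steps on a plain uint8_t and exhaustively compares the result against a naive bit-by-bit reversal for all 256 byte values. The helper names (rotl8, ReverseViaXorRotl, ReverseNaive) are invented here purely for illustration.

  #include <cassert>
  #include <cstdint>

  // Rotate an 8-bit value left by N bits (N in [0,7]).
  static uint8_t rotl8(uint8_t V, unsigned N) {
    N &= 7;
    return static_cast<uint8_t>((V << N) | (V >> ((8 - N) & 7)));
  }

  // Mirrors ExpandBITREVERSE8's DAG nodes on a scalar byte.
  static uint8_t ReverseViaXorRotl(uint8_t Op) {
    uint8_t BL = Op;
    uint8_t AL = BL;        // MOV AL,BL
    BL &= 0x55;             // AND BL,55H
    AL ^= BL;               // XOR AL,BL
    BL = rotl8(BL, 2);      // ROL BL,2
    AL = BL | AL;           // OR AL,BL
    uint8_t CL = AL;        // MOV CL,AL
    AL &= 0x66;             // AND AL,66H
    CL = AL ^ CL;           // XOR CL,AL
    AL = rotl8(AL, 4);      // ROL AL,4
    AL = AL | CL;           // OR AL,CL
    return rotl8(AL, 1);    // ROL AL,1
  }

  // Reference implementation: reverse the bits one at a time.
  static uint8_t ReverseNaive(uint8_t V) {
    uint8_t R = 0;
    for (int I = 0; I < 8; ++I)
      R = static_cast<uint8_t>((R << 1) | ((V >> I) & 1));
    return R;
  }

  int main() {
    for (unsigned V = 0; V != 256; ++V)
      assert(ReverseViaXorRotl(static_cast<uint8_t>(V)) ==
             ReverseNaive(static_cast<uint8_t>(V)));
    return 0;
  }

The expansion needs only five constants and nine logic/rotate nodes (the two same-type bitcasts fold away), but it emits ISD::ROTL unconditionally, so it is only a win on targets where rotates are legal, as they are on x86.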
Index: test/CodeGen/X86/bitreverse.ll
===================================================================
--- test/CodeGen/X86/bitreverse.ll
+++ test/CodeGen/X86/bitreverse.ll
@@ -323,39 +323,35 @@
 define i8 @test_bitreverse_i8(i8 %a) {
 ; X86-LABEL: test_bitreverse_i8:
 ; X86:       # BB#0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    rolb $4, %al
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $51, %cl
-; X86-NEXT:    shlb $2, %cl
-; X86-NEXT:    andb $-52, %al
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $85, %cl
-; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    andb $-86, %al
-; X86-NEXT:    shrb %al
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), [[AL:%[a-s][i]*[l]]]
+; X86-NEXT:    movl [[EAX:%[e|r][a-z][x|i]]], [[ECX:%[e|r][a-z][x|i]]]
+; X86-NEXT:    andb $85, [[CL:%[a-s][i]*[l]]]
+; X86-NEXT:    xorb [[CL]], [[DL:%[a-s][i]*[l]]]
+; X86-NEXT:    rolb $2, [[CL]]
+; X86-NEXT:    orb [[DL]], [[CL]]
+; X86-NEXT:    movl [[ECX]], [[EAX]]
+; X86-NEXT:    andb $102, [[AL:%[a-s][i]*[l]]]
+; X86-NEXT:    xorb [[AL]], [[CL]]
+; X86-NEXT:    rolb $4, [[AL]]
+; X86-NEXT:    orb [[CL]], [[AL]]
+; X86-NEXT:    rolb [[AL]]
 ; X86-NEXT:    retl
-;
+
 ; X64-LABEL: test_bitreverse_i8:
 ; X64:       # BB#0:
-; X64-NEXT:    rolb $4, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $51, %al
-; X64-NEXT:    shlb $2, %al
-; X64-NEXT:    andb $-52, %dil
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    addb %al, %al
-; X64-NEXT:    andb $-86, %dil
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl [[EDI:%[e|r][a-z][x|i]]], [[ECX:%[e|r][a-z][x|i]]]
+; X64-NEXT:    andb $85, [[CL:%[a-s][i]*[l]]]
+; X64-NEXT:    xorb [[CL]], [[DL:%[a-s][i]*[l]]]
+; X64-NEXT:    rolb $2, [[CL]]
+; X64-NEXT:    orb [[DL]], [[CL]]
+; X64-NEXT:    movl [[ECX]], [[EAX:%[e|r][a-z][x|i]]]
+; X64-NEXT:    andb $102, [[AL:%[a-s][i]*[l]]]
+; X64-NEXT:    xorb [[AL]], [[CL]]
+; X64-NEXT:    rolb $4, [[AL]]
+; X64-NEXT:    orb [[CL]], [[AL]]
+; X64-NEXT:    rolb [[AL]]
 ; X64-NEXT:    retq
+
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %b
 }
@@ -365,40 +361,35 @@
 define i4 @test_bitreverse_i4(i4 %a) {
 ; X86-LABEL: test_bitreverse_i4:
 ; X86:       # BB#0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    rolb $4, %al
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $51, %cl
-; X86-NEXT:    shlb $2, %cl
-; X86-NEXT:    andb $-52, %al
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $80, %cl
-; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    andb $-96, %al
-; X86-NEXT:    shrb %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    shrb $4, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), [[AL:%[a-s][i]*[l]]]
+; X86-NEXT:    movl [[EAX:%[e|r][a-z][x|i]]], [[ECX:%[e|r][a-z][x|i]]]
+; X86-NEXT:    andb $85, [[CL:%[a-s][i]*[l]]]
+; X86-NEXT:    xorb [[CL]], [[DL:%[a-s][i]*[l]]]
+; X86-NEXT:    rolb $2, [[CL]]
+; X86-NEXT:    orb [[DL]], [[CL]]
+; X86-NEXT:    movl [[ECX]], [[EAX]]
+; X86-NEXT:    andb $102, [[AL:%[a-s][i]*[l]]]
+; X86-NEXT:    xorb [[AL]], [[CL]]
+; X86-NEXT:    rolb $4, [[AL]]
+; X86-NEXT:    orb [[CL]], [[AL]]
+; X86-NEXT:    rolb [[AL]]
+; X86-NEXT:    shrb $4, [[AL]]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bitreverse_i4:
 ; X64:       # BB#0:
-; X64-NEXT:    rolb $4, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $51, %al
-; X64-NEXT:    shlb $2, %al
-; X64-NEXT:    andb $-52, %dil
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $80, %al
-; X64-NEXT:    addb %al, %al
-; X64-NEXT:    andb $-96, %dil
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    shrb $4, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl [[EDI:%[e|r][a-z][x|i]]], [[ECX:%[e|r][a-z][x|i]]]
+; X64-NEXT:    andb $85, [[CL:%[a-s][i]*[l]]]
+; X64-NEXT:    xorb [[CL]], [[DL:%[a-s][i]*[l]]]
+; X64-NEXT:    rolb $2, [[CL]]
+; X64-NEXT:    orb [[DL]], [[CL]]
+; X64-NEXT:    movl [[ECX]], [[EAX:%[e|r][a-z][x|i]]]
+; X64-NEXT:    andb $102, [[AL:%[a-s][i]*[l]]]
+; X64-NEXT:    xorb [[AL]], [[CL]]
+; X64-NEXT:    rolb $4, [[AL]]
+; X64-NEXT:    orb [[CL]], [[AL]]
+; X64-NEXT:    rolb [[AL]]
+; X64-NEXT:    shrb $4, [[AL]]
 ; X64-NEXT:    retq
   %b = call i4 @llvm.bitreverse.i4(i4 %a)
   ret i4 %b
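A note on the updated checks: [[AL:%[a-s][i]*[l]]] uses FileCheck's variable syntax, binding AL to whichever byte register the regex matches and requiring every later [[AL]] occurrence to match the same register, so the test no longer hard-codes the register allocator's choices. To reproduce the new output locally, an invocation along these lines should work (a sketch only; the authoritative RUN lines sit at the top of bitreverse.ll, and the -mtriple values below are assumptions):

  llc -mtriple=i686-unknown < test/CodeGen/X86/bitreverse.ll \
    | FileCheck --check-prefix=X86 test/CodeGen/X86/bitreverse.ll
  llc -mtriple=x86_64-unknown < test/CodeGen/X86/bitreverse.ll \
    | FileCheck --check-prefix=X64 test/CodeGen/X86/bitreverse.ll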