Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -890,6 +890,34 @@
   return getInt64(CurDAG, dl, Imm);
 }
 
+// Select a 32-bit constant.
+static SDNode *getInt32(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
+  SDNode *Result;
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+  auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+  };
+
+  if (isInt<16>(Imm)) {
+    // Just the Lo bits.
+    Result = CurDAG->getMachineNode(PPC::LI, dl, MVT::i32, getI32Imm(Lo));
+  } else if (Lo) {
+    // Handle the Hi bits.
+    unsigned OpC = Hi ? PPC::LIS : PPC::LI;
+    Result = CurDAG->getMachineNode(OpC, dl, MVT::i32, getI32Imm(Hi));
+    // And Lo bits.
+    Result = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
+                                    SDValue(Result, 0), getI32Imm(Lo));
+  } else {
+    // Just the Hi bits.
+    Result = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32, getI32Imm(Hi));
+  }
+
+  return Result;
+}
+
 namespace {
 
 class BitPermutationSelector {
@@ -1978,6 +2006,197 @@
     BitGroups.erase(remove_if(BitGroups, F), BitGroups.end());
   }
 
+  bool isReverseBits() {
+    if (!Bits[0].hasValue())
+      return false;
+
+    SDValue V = Bits[0].getValue();
+    unsigned MSB = Bits.size() - 1;
+    for (unsigned i = 0; i < Bits.size(); ++i) {
+      if (!Bits[i].hasValue() || (Bits[i].getValue() != V))
+        return false;
+      unsigned VBI = Bits[i].getValueBitIndex();
+      if ((i + VBI) != MSB)
+        return false;
+    }
+    return true;
+  }
+
+  // For a bit reversal we have a faster algorithm than bit-by-bit moves.
+  SDNode *SelectReverseBits(SDNode *N) {
+    SDLoc dl(N);
+
+    if (Bits.size() == 32) {
+      // n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1);
+      SDValue mask1 = SDValue(getInt32(CurDAG, dl, 0x55555555), 0);
+      SDValue shr1Ops[] = {Bits[0].getValue(), getI32Imm(31, dl),
+                           getI32Imm(1, dl), getI32Imm(31, dl)};
+      SDValue shr1 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                    shr1Ops), 0);
+      SDValue left1 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                     shr1, mask1), 0);
+
+      SDValue and1 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                    Bits[0].getValue(), mask1), 0);
+      SDValue shl1Ops[] = {and1, getI32Imm(1, dl), getI32Imm(0, dl),
+                           getI32Imm(30, dl)};
+      SDValue right1 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                      shl1Ops), 0);
+
+      SDValue v1 = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, left1,
+                                                  right1), 0);
+
+      // n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2);
+      SDValue mask2 = SDValue(getInt32(CurDAG, dl, 0x33333333), 0);
+      SDValue shr2Ops[] = {v1, getI32Imm(30, dl), getI32Imm(2, dl),
+                           getI32Imm(31, dl)};
+      SDValue shr2 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                    shr2Ops), 0);
+      SDValue left2 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                     shr2, mask2), 0);
+
+      SDValue and2 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                    v1, mask2), 0);
+      SDValue shl2Ops[] = {and2, getI32Imm(2, dl), getI32Imm(0, dl),
+                           getI32Imm(29, dl)};
+      SDValue right2 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                      shl2Ops), 0);
+
+      SDValue v2 = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, left2,
+                                                  right2), 0);
+
+      // n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4);
+      SDValue mask4 = SDValue(getInt32(CurDAG, dl, 0x0F0F0F0F), 0);
+      SDValue shr4Ops[] = {v2, getI32Imm(28, dl), getI32Imm(4, dl),
+                           getI32Imm(31, dl)};
+      SDValue shr4 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                    shr4Ops), 0);
+      SDValue left4 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                     shr4, mask4), 0);
+
+      SDValue and4 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                    v2, mask4), 0);
+      SDValue shl4Ops[] = {and4, getI32Imm(4, dl), getI32Imm(0, dl),
+                           getI32Imm(27, dl)};
+      SDValue right4 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                      shl4Ops), 0);
+
+      SDValue v4 = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, left4,
+                                                  right4), 0);
+
+      // Byte swap.
+      // n = ((n rotl 8) & 0x00FF00FF) | ((n & 0x00FF00FF) rotr 8)
+      SDValue mask8 = SDValue(getInt32(CurDAG, dl, 0x00FF00FF), 0);
+      SDValue rotl8Ops[] = {v4, getI32Imm(8, dl), getI32Imm(8, dl),
+                            getI32Imm(31, dl)};
+      SDValue rotl8 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                     rotl8Ops), 0);
+      SDValue left8 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                     rotl8, mask8), 0);
+
+      SDValue and8 = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32,
+                                                    v4, mask8), 0);
+      SDValue rotr8Ops[] = {and8, getI32Imm(24, dl), getI32Imm(0, dl),
+                            getI32Imm(23, dl)};
+      SDValue right8 = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                                      rotr8Ops), 0);
+
+      SDValue result = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                                                      left8, right8), 0);
+      return result.getNode();
+    }
+
+    assert(Bits.size() == 64 && "Not 64 bits here?");
+    // n = ((n >> 1) & 0x5555555555555555) | ((n & 0x5555555555555555) << 1);
+    SDValue mask1 = SDValue(getInt64(CurDAG, dl, 0x5555555555555555), 0);
+    SDValue shr1Ops[] = {Bits[0].getValue(), getI32Imm(63, dl),
+                         getI32Imm(1, dl)};
+    SDValue shr1 = SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+                                                  shr1Ops), 0);
+    SDValue left1 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                                   shr1, mask1), 0);
+
+    SDValue and1 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                                  Bits[0].getValue(), mask1), 0);
+    SDValue shl1Ops[] = {and1, getI32Imm(1, dl), getI32Imm(62, dl)};
+    SDValue right1 = SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                                    shl1Ops), 0);
+
+    SDValue v1 = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, left1,
+                                                right1), 0);
+
+    // n = ((n >> 2) & 0x3333333333333333) | ((n & 0x3333333333333333) << 2);
+    SDValue mask2 = SDValue(getInt64(CurDAG, dl, 0x3333333333333333), 0);
+    SDValue shr2Ops[] = {v1, getI32Imm(62, dl), getI32Imm(2, dl)};
+    SDValue shr2 = SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+                                                  shr2Ops), 0);
+    SDValue left2 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                                   shr2, mask2), 0);
+
+    SDValue and2 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, v1,
+                                                  mask2), 0);
+    SDValue shl2Ops[] = {and2, getI32Imm(2, dl), getI32Imm(61, dl)};
+    SDValue right2 = SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                                    shl2Ops), 0);
+
+    SDValue v2 = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, left2,
+                                                right2), 0);
+
+    // n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n & 0x0F0F0F0F0F0F0F0F) << 4);
+    SDValue mask4 = SDValue(getInt64(CurDAG, dl, 0x0F0F0F0F0F0F0F0F), 0);
+    SDValue shr4Ops[] = {v2, getI32Imm(60, dl), getI32Imm(4, dl)};
+    SDValue shr4 = SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+                                                  shr4Ops), 0);
+    SDValue left4 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                                   shr4, mask4), 0);
+
+    SDValue and4 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, v2,
+                                                  mask4), 0);
+    SDValue shl4Ops[] = {and4, getI32Imm(4, dl), getI32Imm(59, dl)};
+    SDValue right4 = SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                                    shl4Ops), 0);
+
+    SDValue v4 = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, left4,
+                                                right4), 0);
+
+    // Byte swap.
+    // n = ((n rotl 8) & 0xff000000ff) | ((n & 0xff000000ff) rotr 8) |
+    //     ((n rotl 24) & 0xff000000ff00) | ((n & 0xff000000ff00) rotr 24)
+    SDValue mask8 = SDValue(getInt64(CurDAG, dl, 0xFF000000FF), 0);
+    SDValue rotl8Ops[] = {v4, getI32Imm(8, dl), getI32Imm(0, dl)};
+    SDValue rotl8 = SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+                                                   rotl8Ops), 0);
+    SDValue part1 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                                   rotl8, mask8), 0);
+
+    SDValue and8 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, v4,
+                                                  mask8), 0);
+    SDValue rotr8Ops[] = {and8, getI32Imm(56, dl), getI32Imm(0, dl)};
+    SDValue part2 = SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+                                                   rotr8Ops), 0);
+
+    mask8 = SDValue(getInt64(CurDAG, dl, 0xFF000000FF00), 0);
+    SDValue rotl24Ops[] = {v4, getI32Imm(24, dl), getI32Imm(0, dl)};
+    SDValue rotl24 = SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+                                                    rotl24Ops), 0);
+    SDValue part3 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                                   rotl24, mask8), 0);
+
+    SDValue and24 = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                                   v4, mask8), 0);
+    SDValue rotr24Ops[] = {and24, getI32Imm(40, dl), getI32Imm(0, dl)};
+    SDValue part4 = SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64,
+                                                   rotr24Ops), 0);
+
+    SDValue part12 = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                                                    part1, part2), 0);
+    SDValue part34 = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                                                    part3, part4), 0);
+    SDValue result = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                                                    part12, part34), 0);
+    return result.getNode();
+  }
+
   SmallVector<ValueBit, 64> Bits;
 
   bool HasZeros;
@@ -2009,6 +2228,9 @@
                     " selection for: ");
     DEBUG(N->dump(CurDAG));
 
+    if (isReverseBits())
+      return SelectReverseBits(N);
+
     // Fill it RLAmt and set HasZeros.
     computeRotationAmounts();
 
Index: test/CodeGen/PowerPC/pr33093.ll
===================================================================
--- test/CodeGen/PowerPC/pr33093.ll
+++ test/CodeGen/PowerPC/pr33093.ll
@@ -0,0 +1,105 @@
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+; Fast reverse bits algorithm
+; n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1);
+; n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2);
+; n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4);
+; n = ((n rotl 8) & 0x00FF00FF) | ((n & 0x00FF00FF) rotr 8) // byte swap
+
+define zeroext i32 @ReverseBits(i32 zeroext %n) {
+entry:
+  %shr = lshr i32 %n, 1
+  %and = and i32 %shr, 1431655765
+  %and1 = shl i32 %n, 1
+  %shl = and i32 %and1, -1431655766
+  %or = or i32 %and, %shl
+  %shr2 = lshr i32 %or, 2
+  %and3 = and i32 %shr2, 858993459
+  %and4 = shl i32 %or, 2
+  %shl5 = and i32 %and4, -858993460
+  %or6 = or i32 %and3, %shl5
+  %shr7 = lshr i32 %or6, 4
+  %and8 = and i32 %shr7, 252645135
+  %and9 = shl i32 %or6, 4
+  %shl10 = and i32 %and9, -252645136
+  %or11 = or i32 %and8, %shl10
+  %shr13 = lshr i32 %or11, 24
+  %and14 = lshr i32 %or11, 8
+  %shr15 = and i32 %and14, 65280
+  %and17 = shl i32 %or11, 8
+  %shl18 = and i32 %and17, 16711680
+  %shl21 = shl i32 %or11, 24
+  %or16 = or i32 %shl21, %shr13
+  %or19 = or i32 %or16, %shr15
+  %or22 = or i32 %or19, %shl18
+  ret i32 %or22
+; CHECK-LABEL: @ReverseBits
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+}
+
+; Fast reverse bits algorithm
+; n = ((n >> 1) & 0x5555555555555555) | ((n & 0x5555555555555555) << 1);
+; n = ((n >> 2) & 0x3333333333333333) | ((n & 0x3333333333333333) << 2);
+; n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n & 0x0F0F0F0F0F0F0F0F) << 4);
+; n = ((n rotl 8) & 0xff000000ff) | ((n & 0xff000000ff) rotr 8) |
+;     ((n rotl 24) & 0xff000000ff00) | ((n & 0xff000000ff00) rotr 24) // byte swap
+
+define i64 @ReverseBits64(i64 %n) {
+entry:
+  %shr = lshr i64 %n, 1
+  %and = and i64 %shr, 6148914691236517205
+  %and1 = shl i64 %n, 1
+  %shl = and i64 %and1, -6148914691236517206
+  %or = or i64 %and, %shl
+  %shr2 = lshr i64 %or, 2
+  %and3 = and i64 %shr2, 3689348814741910323
+  %and4 = shl i64 %or, 2
+  %shl5 = and i64 %and4, -3689348814741910324
+  %or6 = or i64 %and3, %shl5
+  %shr7 = lshr i64 %or6, 4
+  %and8 = and i64 %shr7, 1085102592571150095
+  %and9 = shl i64 %or6, 4
+  %shl10 = and i64 %and9, -1085102592571150096
+  %or11 = or i64 %and8, %shl10
+  %shr13 = lshr i64 %or11, 56
+  %and14 = lshr i64 %or11, 40
+  %shr15 = and i64 %and14, 65280
+  %and17 = lshr i64 %or11, 24
+  %shr18 = and i64 %and17, 16711680
+  %and20 = lshr i64 %or11, 8
+  %shr21 = and i64 %and20, 4278190080
+  %and23 = shl i64 %or11, 8
+  %shl24 = and i64 %and23, 1095216660480
+  %and26 = shl i64 %or11, 24
+  %shl27 = and i64 %and26, 280375465082880
+  %and29 = shl i64 %or11, 40
+  %shl30 = and i64 %and29, 71776119061217280
+  %shl33 = shl i64 %or11, 56
+  %or16 = or i64 %shl33, %shr13
+  %or19 = or i64 %or16, %shr15
+  %or22 = or i64 %or19, %shr18
+  %or25 = or i64 %or22, %shr21
+  %or28 = or i64 %or25, %shl24
+  %or31 = or i64 %or28, %shl27
+  %or34 = or i64 %or31, %shl30
+  ret i64 %or34
+; CHECK-LABEL: @ReverseBits64
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+; CHECK: and
+}
+
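
Reference note (not part of the patch): both test functions are straight-line IR for the classic divide-and-conquer bit reversal, and the RLWINM/RLDICL/RLDICR sequences built by SelectReverseBits compute the same masks and rotate amounts. A minimal C++ sketch of that scalar algorithm, useful for sanity-checking the hard-coded constants; the names reverseBits32/reverseBits64 are illustrative only and do not appear in the patch:

    #include <cstdint>

    // 32-bit: swap adjacent bits, then 2-bit pairs, then nibbles, then
    // byte-swap the word with two rotate-and-mask terms.
    static uint32_t reverseBits32(uint32_t n) {
      n = ((n >> 1) & 0x55555555u) | ((n & 0x55555555u) << 1);
      n = ((n >> 2) & 0x33333333u) | ((n & 0x33333333u) << 2);
      n = ((n >> 4) & 0x0F0F0F0Fu) | ((n & 0x0F0F0F0Fu) << 4);
      // ((n rotl 8) & 0x00FF00FF) | ((n & 0x00FF00FF) rotr 8)
      uint32_t hi = ((n << 8) | (n >> 24)) & 0x00FF00FFu;
      uint32_t lo = ((n & 0x00FF00FFu) >> 8) | ((n & 0x00FF00FFu) << 24);
      return hi | lo;
    }

    // 64-bit: same bit/pair/nibble swaps, then a byte swap assembled from
    // the four rotate-and-mask terms used by the 64-bit selection path.
    static uint64_t reverseBits64(uint64_t n) {
      n = ((n >> 1) & 0x5555555555555555ULL) | ((n & 0x5555555555555555ULL) << 1);
      n = ((n >> 2) & 0x3333333333333333ULL) | ((n & 0x3333333333333333ULL) << 2);
      n = ((n >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((n & 0x0F0F0F0F0F0F0F0FULL) << 4);
      const uint64_t M1 = 0x000000FF000000FFULL; // 0xff000000ff
      const uint64_t M2 = 0x0000FF000000FF00ULL; // 0xff000000ff00
      uint64_t p1 = ((n << 8) | (n >> 56)) & M1;         // (n rotl 8)  & M1
      uint64_t p2 = ((n & M1) >> 8) | ((n & M1) << 56);  // (n & M1) rotr 8
      uint64_t p3 = ((n << 24) | (n >> 40)) & M2;        // (n rotl 24) & M2
      uint64_t p4 = ((n & M2) >> 24) | ((n & M2) << 40); // (n & M2) rotr 24
      return p1 | p2 | p3 | p4;
    }

Spot checks such as reverseBits32(1) == 0x80000000 and reverseBits64(1) == 0x8000000000000000 agree with what @ReverseBits and @ReverseBits64 compute, so the selected instruction sequences can be compared against this reference on a handful of values.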