diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4410,6 +4410,12 @@
   /// \returns The expansion result or SDValue() if it fails.
   SDValue expandBSWAP(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expands scalar/vector BITREVERSE nodes. Returns SDValue() if expand
+  /// fails.
+  /// \param N Node to expand
+  /// \returns The expansion result or SDValue() if it fails.
+  SDValue expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const;
+
   /// Turn load of vector type into a load of the individual elements.
   /// \param LD load to expand
   /// \returns BUILD_VECTOR and TokenFactor nodes.
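For reference, ISD::BITREVERSE reverses the order of all bits in a value, unlike ISD::BSWAP, which reverses whole bytes. A minimal scalar model of the operation the new hook expands, sketched in plain C++ (the helper name is illustrative, not part of the patch):

#include <cstdint>

// Reference semantics of ISD::BITREVERSE on a 32-bit scalar:
// bit I of the result is bit (31 - I) of the input.
uint32_t bitreverse32_reference(uint32_t V) {
  uint32_t R = 0;
  for (unsigned I = 0; I < 32; ++I)
    R |= ((V >> I) & 1u) << (31 - I);
  return R;
}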
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -181,7 +181,6 @@
                              SmallVectorImpl<SDValue> &Results);
   SDValue PromoteLegalFP_TO_INT_SAT(SDNode *Node, const SDLoc &dl);
 
-  SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
   SDValue ExpandPARITY(SDValue Op, const SDLoc &dl);
 
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
@@ -2781,70 +2780,6 @@
   return DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Result);
 }
 
-/// Legalize a BITREVERSE scalar/vector operation as a series of mask + shifts.
-SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
-  EVT VT = Op.getValueType();
-  EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-  unsigned Sz = VT.getScalarSizeInBits();
-
-  SDValue Tmp, Tmp2, Tmp3;
-
-  // If we can, perform BSWAP first and then the mask+swap the i4, then i2
-  // and finally the i1 pairs.
-  // TODO: We can easily support i4/i2 legal types if any target ever does.
-  if (Sz >= 8 && isPowerOf2_32(Sz)) {
-    // Create the masks - repeating the pattern every byte.
-    APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
-    APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
-    APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
-    APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
-    APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
-    APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));
-
-    // BSWAP if the type is wider than a single byte.
-    Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);
-
-    // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
-    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
-
-    // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
-    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
-
-    // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
-    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
-    return Tmp;
-  }
-
-  Tmp = DAG.getConstant(0, dl, VT);
-  for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) {
-    if (I < J)
-      Tmp2 =
-          DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT));
-    else
-      Tmp2 =
-          DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));
-
-    APInt Shift(Sz, 1);
-    Shift <<= J;
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
-    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2);
-  }
-
-  return Tmp;
-}
-
 /// Open code the operations for PARITY of the specified operation.
 SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) {
   EVT VT = Op.getValueType();
@@ -2893,7 +2828,8 @@
     Results.push_back(Tmp1);
     break;
   case ISD::BITREVERSE:
-    Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl));
+    if ((Tmp1 = TLI.expandBITREVERSE(Node, DAG)))
+      Results.push_back(Tmp1);
     break;
   case ISD::BSWAP:
     if ((Tmp1 = TLI.expandBSWAP(Node, DAG)))
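The sequence being moved is the classic O(log n) bit-reversal expansion: byte-swap first, then exchange nibbles, bit pairs, and finally adjacent bits using byte-splatted masks; widths that are not a power of two fall back to placing each bit individually. A self-contained C++ sketch of both paths, with uint32_t/uint64_t standing in for the legal scalar type (the helper names are hypothetical, not from the patch):

#include <cstdint>

// Power-of-2 path: the same mask+shift sequence the expansion builds.
uint32_t bitreverse32(uint32_t V) {
  // BSWAP: reverse the byte order first.
  V = (V >> 24) | ((V >> 8) & 0x0000FF00u) |
      ((V << 8) & 0x00FF0000u) | (V << 24);
  // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4), splat per byte.
  V = ((V & 0xF0F0F0F0u) >> 4) | ((V & 0x0F0F0F0Fu) << 4);
  // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
  V = ((V & 0xCCCCCCCCu) >> 2) | ((V & 0x33333333u) << 2);
  // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
  V = ((V & 0xAAAAAAAAu) >> 1) | ((V & 0x55555555u) << 1);
  return V;
}

// Fallback path for odd widths Sz: move each bit into place, one OR at a
// time, mirroring the Sz-iteration loop at the end of the expansion.
uint64_t bitreverse_n(uint64_t V, unsigned Sz) {
  uint64_t R = 0;
  for (unsigned I = 0, J = Sz - 1; I < Sz; ++I, --J) {
    uint64_t Moved = I < J ? V << (J - I) : V >> (I - J);
    R |= Moved & (uint64_t(1) << J); // keep only destination bit J
  }
  return R;
}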
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -478,6 +478,16 @@
   EVT NVT = Op.getValueType();
   SDLoc dl(N);
 
+  // If the larger BITREVERSE isn't supported by the target, try to expand now.
+  // If we expand later we'll end up with more operations since we lost the
+  // original type. We only do this for scalars since we have a shuffle
+  // based lowering for vectors in LegalizeVectorOps.
+  if (!OVT.isVector() && OVT.isSimple() &&
+      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE, NVT)) {
+    if (SDValue Res = TLI.expandBITREVERSE(N, DAG))
+      return DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Res);
+  }
+
   unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
   EVT ShiftVT = getShiftAmountTyForConstant(NVT, TLI, DAG);
   return DAG.getNode(ISD::SRL, dl, NVT,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7115,6 +7115,71 @@
   }
 }
 
+SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT SHVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  unsigned Sz = VT.getScalarSizeInBits();
+
+  SDValue Tmp, Tmp2, Tmp3;
+
+  // If we can, perform BSWAP first and then mask+swap the i4 pairs, then the
+  // i2 pairs, and finally the i1 pairs.
+  // TODO: We can easily support i4/i2 legal types if any target ever does.
+  if (Sz >= 8 && isPowerOf2_32(Sz)) {
+    // Create the masks - repeating the pattern every byte.
+    APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
+    APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
+    APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
+    APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+    APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
+    APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));
+
+    // BSWAP if the type is wider than a single byte.
+    Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);
+
+    // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
+    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+
+    // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
+    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+
+    // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
+    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+    return Tmp;
+  }
+
+  Tmp = DAG.getConstant(0, dl, VT);
+  for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) {
+    if (I < J)
+      Tmp2 =
+          DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT));
+    else
+      Tmp2 =
+          DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));
+
+    APInt Shift(Sz, 1);
+    Shift <<= J;
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
+    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2);
+  }
+
+  return Tmp;
+}
+
 std::pair<SDValue, SDValue>
 TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
                                     SelectionDAG &DAG) const {
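The early expansion in PromoteIntRes_BITREVERSE pays off because reversing in the original type and reversing in the promoted type differ only by a shift: for an N-bit value promoted to M bits, rev_N(x) == rev_M(x) >> (M - N). Expanding before the original type is lost keeps the masks narrow. A small check of that identity for i16 promoted to i32 (illustrative helper, not from the patch):

#include <cassert>
#include <cstdint>

// Reverse the low Sz bits of V (naive reference helper).
static uint32_t RevBits(uint32_t V, unsigned Sz) {
  uint32_t R = 0;
  for (unsigned I = 0; I < Sz; ++I)
    R |= ((V >> I) & 1u) << (Sz - 1 - I);
  return R;
}

int main() {
  uint16_t X = 0xBEEF;
  // Late expansion: reverse in the promoted i32, then SRL by DiffBits.
  uint32_t ViaWide = RevBits(X, 32) >> (32 - 16);
  // Early expansion: reverse in the original i16, then any-extend.
  uint32_t ViaNarrow = RevBits(X, 16);
  assert(ViaWide == ViaNarrow);
  return 0;
}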
diff --git a/llvm/test/CodeGen/RISCV/rv32Zbp.ll b/llvm/test/CodeGen/RISCV/rv32Zbp.ll
--- a/llvm/test/CodeGen/RISCV/rv32Zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32Zbp.ll
@@ -2449,26 +2449,20 @@
 define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
 ; RV32I-LABEL: bitreverse_i8:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a1, a0, 20
-; RV32I-NEXT:    lui a2, 61440
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a0, a0, 28
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    lui a1, 208896
-; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    lui a2, 835584
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    andi a0, a0, 204
 ; RV32I-NEXT:    srli a0, a0, 2
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 348160
-; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    andi a1, a0, 85
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    lui a2, 696320
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    andi a0, a0, 170
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    ret
 ;
 ; RV32IB-LABEL: bitreverse_i8:
@@ -2491,33 +2485,36 @@
 define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
 ; RV32I-LABEL: bitreverse_i16:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a1, a0, 8
-; RV32I-NEXT:    lui a2, 4080
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srli a1, a0, 8
+; RV32I-NEXT:    slli a0, a0, 8
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 61680
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a1, a1, -241
 ; RV32I-NEXT:    and a1, a0, a1
 ; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    lui a2, 986880
+; RV32I-NEXT:    lui a2, 15
+; RV32I-NEXT:    addi a2, a2, 240
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 4
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 209712
+; RV32I-NEXT:    lui a1, 3
+; RV32I-NEXT:    addi a1, a1, 819
 ; RV32I-NEXT:    and a1, a0, a1
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    lui a2, 838848
+; RV32I-NEXT:    lui a2, 13
+; RV32I-NEXT:    addi a2, a2, -820
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    lui a1, 349520
+; RV32I-NEXT:    lui a1, 5
+; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    and a1, a0, a1
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    lui a2, 699040
+; RV32I-NEXT:    lui a2, 11
+; RV32I-NEXT:    addi a2, a2, -1366
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a0, a0, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV32IB-LABEL: bitreverse_i16:
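The improvements in these checks come from the masks now fitting the original type: the i8 masks become andi immediates, and each wider mask needs at most one lui+addi pair, where lui supplies bits 31:12 and addi adds a sign-extended 12-bit low part. A quick sanity check of the i32 mask constants that appear in the checks below (values taken from the tests; the helper is illustrative):

#include <cassert>
#include <cstdint>

// lui loads Imm20 << 12; addi/addiw then adds a sign-extended 12-bit value.
static int32_t LuiAddi(uint32_t Imm20, int32_t Imm12) {
  return (int32_t)((Imm20 << 12) + (uint32_t)Imm12);
}

int main() {
  assert(LuiAddi(61681, -241) == 0x0F0F0F0F);            // i4 swap mask
  assert(LuiAddi(209715, 819) == 0x33333333);            // i2 swap mask
  assert(LuiAddi(349525, 1365) == 0x55555555);           // i1 swap mask
  assert(LuiAddi(838861, -820) == (int32_t)0xCCCCCCCCu); // i2 high mask
  assert(LuiAddi(699051, -1366) == (int32_t)0xAAAAAAAAu); // i1 high mask
  return 0;
}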
diff --git a/llvm/test/CodeGen/RISCV/rv64Zbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbp.ll
--- a/llvm/test/CodeGen/RISCV/rv64Zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64Zbp.ll
@@ -2820,31 +2820,20 @@
 define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
 ; RV64I-LABEL: bitreverse_i8:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a0, 52
-; RV64I-NEXT:    addi a2, zero, 15
-; RV64I-NEXT:    slli a2, a2, 56
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    slli a0, a0, 60
+; RV64I-NEXT:    srli a1, a0, 4
+; RV64I-NEXT:    andi a0, a0, 15
+; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    addi a1, zero, 51
-; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    and a1, a0, a1
+; RV64I-NEXT:    andi a1, a0, 51
 ; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    addi a2, zero, -13
-; RV64I-NEXT:    slli a2, a2, 58
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    andi a0, a0, 204
 ; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addi a1, zero, 85
-; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    and a1, a0, a1
+; RV64I-NEXT:    andi a1, a0, 85
 ; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    addi a2, zero, -43
-; RV64I-NEXT:    slli a2, a2, 57
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    andi a0, a0, 170
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    ret
 ;
 ; RV64IB-LABEL: bitreverse_i8:
@@ -2867,45 +2856,36 @@
 define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
 ; RV64I-LABEL: bitreverse_i16:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a0, 40
-; RV64I-NEXT:    addi a2, zero, 255
-; RV64I-NEXT:    slli a2, a2, 48
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srli a1, a0, 8
+; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    lui a1, 1
 ; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 48
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 4
-; RV64I-NEXT:    addi a2, zero, -241
-; RV64I-NEXT:    slli a2, a2, 52
+; RV64I-NEXT:    lui a2, 15
+; RV64I-NEXT:    addiw a2, a2, 240
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 4
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    lui a1, 3
 ; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 48
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    lui a2, 1048575
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a2, a2, 50
+; RV64I-NEXT:    lui a2, 13
+; RV64I-NEXT:    addiw a2, a2, -820
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    lui a1, 5
 ; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 48
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    lui a2, 1048573
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 49
+; RV64I-NEXT:    lui a2, 11
+; RV64I-NEXT:    addiw a2, a2, -1366
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    ret
 ;
 ; RV64IB-LABEL: bitreverse_i16:
@@ -2928,89 +2908,48 @@
 define signext i32 @bitreverse_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: bitreverse_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    lui a2, 4080
+; RV64I-NEXT:    srliw a1, a0, 8
+; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a2, a0, 8
-; RV64I-NEXT:    addi a3, zero, 255
-; RV64I-NEXT:    slli a4, a3, 24
-; RV64I-NEXT:    and a2, a2, a4
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    srli a2, a0, 40
-; RV64I-NEXT:    lui a4, 16
-; RV64I-NEXT:    addiw a4, a4, -256
-; RV64I-NEXT:    and a2, a2, a4
-; RV64I-NEXT:    srli a4, a0, 56
-; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    srliw a2, a0, 24
 ; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    slli a2, a0, 8
-; RV64I-NEXT:    slli a4, a3, 32
-; RV64I-NEXT:    and a2, a2, a4
-; RV64I-NEXT:    slli a4, a0, 24
-; RV64I-NEXT:    slli a5, a3, 40
-; RV64I-NEXT:    and a4, a4, a5
-; RV64I-NEXT:    or a2, a4, a2
-; RV64I-NEXT:    slli a4, a0, 40
-; RV64I-NEXT:    slli a3, a3, 48
-; RV64I-NEXT:    and a3, a4, a3
-; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lui a3, 4080
+; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addiw a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 4
-; RV64I-NEXT:    lui a2, 1044721
+; RV64I-NEXT:    lui a2, 241
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 240
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 4
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 13107
+; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    lui a2, 1035469
-; RV64I-NEXT:    addiw a2, a2, -819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -820
+; RV64I-NEXT:    lui a2, 838861
+; RV64I-NEXT:    addiw a2, a2, -820
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    lui a1, 349525
 ; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    lui a2, 873813
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 33
+; RV64I-NEXT:    lui a2, 699051
+; RV64I-NEXT:    addiw a2, a2, -1366
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srai a0, a0, 32
+; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64IB-LABEL: bitreverse_i32:
@@ -3030,89 +2969,47 @@
 define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind {
 ; RV64I-LABEL: bitreverse_i32_nosext:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a2, a0, 24
-; RV64I-NEXT:    lui a3, 4080
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    lui a3, 16
+; RV64I-NEXT:    addiw a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    srli a3, a0, 8
-; RV64I-NEXT:    addi a4, zero, 255
-; RV64I-NEXT:    slli a5, a4, 24
-; RV64I-NEXT:    and a3, a3, a5
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    srli a3, a0, 40
-; RV64I-NEXT:    lui a5, 16
-; RV64I-NEXT:    addiw a5, a5, -256
-; RV64I-NEXT:    and a3, a3, a5
-; RV64I-NEXT:    srli a5, a0, 56
-; RV64I-NEXT:    or a3, a3, a5
-; RV64I-NEXT:    or a6, a2, a3
-; RV64I-NEXT:    slli a3, a0, 8
-; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    and a3, a3, a5
-; RV64I-NEXT:    slli a5, a0, 24
-; RV64I-NEXT:    slli a2, a4, 40
-; RV64I-NEXT:    and a2, a5, a2
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a3, a0, 40
-; RV64I-NEXT:    slli a4, a4, 48
+; RV64I-NEXT:    slli a3, a0, 8
+; RV64I-NEXT:    lui a4, 4080
 ; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    lui a2, 3855
-; RV64I-NEXT:    addiw a2, a2, 241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 4
-; RV64I-NEXT:    lui a3, 1044721
+; RV64I-NEXT:    lui a3, 241
 ; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, 241
-; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -241
-; RV64I-NEXT:    slli a3, a3, 12
 ; RV64I-NEXT:    addi a3, a3, 240
 ; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    srli a0, a0, 4
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    lui a2, 13107
+; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 2
-; RV64I-NEXT:    lui a3, 1035469
-; RV64I-NEXT:    addiw a3, a3, -819
-; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -819
-; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -819
-; RV64I-NEXT:    slli a3, a3, 12
-; RV64I-NEXT:    addi a3, a3, -820
+; RV64I-NEXT:    lui a3, 838861
+; RV64I-NEXT:    addiw a3, a3, -820
 ; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 32
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 1
-; RV64I-NEXT:    lui a3, 873813
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    slli a3, a3, 33
+; RV64I-NEXT:    lui a3, 699051
+; RV64I-NEXT:    addiw a3, a3, -1366
 ; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sw a0, 0(a1)
 ; RV64I-NEXT:    ret
 ;
@@ -3315,109 +3212,56 @@
 define i32 @bitreverse_bswap_i32(i32 %a) {
 ; RV64I-LABEL: bitreverse_bswap_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a2, a0, 24
-; RV64I-NEXT:    lui a6, 4080
-; RV64I-NEXT:    and a2, a2, a6
-; RV64I-NEXT:    srli a3, a0, 8
-; RV64I-NEXT:    addi a4, zero, 255
-; RV64I-NEXT:    slli a5, a4, 24
-; RV64I-NEXT:    and a3, a3, a5
-; RV64I-NEXT:    or a3, a3, a2
-; RV64I-NEXT:    srli a5, a0, 40
+; RV64I-NEXT:    srliw a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a7, a2, -256
-; RV64I-NEXT:    and a5, a5, a7
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    or a1, a5, a1
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    addiw a2, a2, -256
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srliw a3, a0, 24
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a3, a0, 8
-; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    and a3, a3, a5
-; RV64I-NEXT:    slli a5, a0, 24
-; RV64I-NEXT:    slli a2, a4, 40
-; RV64I-NEXT:    and a2, a5, a2
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a3, a0, 40
-; RV64I-NEXT:    slli a4, a4, 48
+; RV64I-NEXT:    lui a4, 4080
 ; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addiw a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 4
-; RV64I-NEXT:    lui a2, 1044721
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 240
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a3, 241
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    slli a3, a3, 12
+; RV64I-NEXT:    addi a3, a3, 240
+; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    srli a0, a0, 4
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 13107
+; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 819
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 2
-; RV64I-NEXT:    lui a2, 1035469
-; RV64I-NEXT:    addiw a2, a2, -819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -819
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -820
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a3, 838861
+; RV64I-NEXT:    addiw a3, a3, -820
+; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    lui a1, 21845
+; RV64I-NEXT:    lui a1, 349525
 ; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 1365
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 1365
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    lui a2, 1026731
-; RV64I-NEXT:    addiw a2, a2, -1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, -1366
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a3, 699051
+; RV64I-NEXT:    addiw a3, a3, -1366
+; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    lui a2, 1044480
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a2, a0, 40
-; RV64I-NEXT:    and a2, a2, a7
-; RV64I-NEXT:    srli a3, a0, 56
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    srli a0, a0, 24
-; RV64I-NEXT:    and a0, a0, a6
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a2, a0, 24
+; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    slli a2, a0, 8
+; RV64I-NEXT:    and a2, a2, a4
+; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64IB-LABEL: bitreverse_bswap_i32:
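One property worth noting for the bitreverse_bswap_i32 case: composing the two operations cancels the byte reversal, so bswap(bitreverse(x)) is exactly a bit reversal within each byte, with byte order unchanged. The expected sequence above still performs both steps; the sketch below only verifies the algebra using the earlier definitions (illustrative, not from the patch):

#include <cassert>
#include <cstdint>

static uint32_t Bswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0x0000FF00u) |
         ((V << 8) & 0x00FF0000u) | (V << 24);
}

// Bit reversal as BSWAP followed by the per-byte mask+shift swaps.
static uint32_t BitReverse32(uint32_t V) {
  V = Bswap32(V);
  V = ((V & 0xF0F0F0F0u) >> 4) | ((V & 0x0F0F0F0Fu) << 4);
  V = ((V & 0xCCCCCCCCu) >> 2) | ((V & 0x33333333u) << 2);
  V = ((V & 0xAAAAAAAAu) >> 1) | ((V & 0x55555555u) << 1);
  return V;
}

// Reverse the bits inside each byte, leaving the byte order alone.
static uint32_t PerByteRev(uint32_t V) {
  V = ((V & 0xF0F0F0F0u) >> 4) | ((V & 0x0F0F0F0Fu) << 4);
  V = ((V & 0xCCCCCCCCu) >> 2) | ((V & 0x33333333u) << 2);
  V = ((V & 0xAAAAAAAAu) >> 1) | ((V & 0x55555555u) << 1);
  return V;
}

int main() {
  uint32_t X = 0x12345678u;
  assert(Bswap32(BitReverse32(X)) == PerByteRev(X));
  return 0;
}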