diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -88,6 +88,7 @@
   GREVIW,
   GORCI,
   GORCIW,
+  SHFLI,
   // Vector Extension
   // VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand
   // for the VL value to be used for the operation.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2353,6 +2353,20 @@
     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
     break;
   }
+  case RISCVISD::SHFLI: {
+    // There is no SHFLIW instruction, but we can just promote the operation.
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+    SDLoc DL(N);
+    SDValue NewOp0 =
+        DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+    SDValue NewRes =
+        DAG.getNode(RISCVISD::SHFLI, DL, MVT::i64, NewOp0, N->getOperand(1));
+    // ReplaceNodeResults requires we maintain the same type for the return
+    // value.
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
+    break;
+  }
   case ISD::BSWAP:
   case ISD::BITREVERSE: {
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
@@ -2487,7 +2501,8 @@
 // [8]  = 0x00FF00FF / 0xFF00FF00
 // [16] = 0x0000FFFF / 0xFFFFFFFF
 // [32] = 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 (for RV64)
-static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
+static Optional<RISCVBitmanipPat>
+matchRISCVBitmanipPat(SDValue Op, ArrayRef<uint64_t> BitmanipMasks) {
   Optional<uint64_t> Mask;
   // Optionally consume a mask around the shift operation.
   if (Op.getOpcode() == ISD::AND && isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -2500,26 +2515,17 @@
   if (!isa<ConstantSDNode>(Op.getOperand(1)))
     return None;
-  auto ShAmt = Op.getConstantOperandVal(1);
+  uint64_t ShAmt = Op.getConstantOperandVal(1);
 
-  if (!isPowerOf2_64(ShAmt))
+  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+  if (ShAmt >= Width || !isPowerOf2_64(ShAmt))
     return None;
-
-  // These are the unshifted masks which we use to match bit-manipulation
-  // patterns. They may be shifted left in certain circumstances.
-  static const uint64_t BitmanipMasks[] = {
-      0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
-      0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
-  };
-
-  unsigned MaskIdx = Log2_64(ShAmt);
-  if (MaskIdx >= array_lengthof(BitmanipMasks))
+  // If we don't have enough masks for 64 bit, then we must be trying to
+  // match SHFL, so we're only allowed to shift 1/4 of the width.
+  if (BitmanipMasks.size() == 5 && ShAmt >= (Width / 2))
     return None;
-  auto Src = Op.getOperand(0);
-
-  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
-  auto ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+  SDValue Src = Op.getOperand(0);
 
   // The expected mask is shifted left when the AND is found around SHL
   // patterns.
@@ -2546,6 +2552,9 @@
     }
   }
 
+  unsigned MaskIdx = Log2_32(ShAmt);
+  uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
   if (SHLExpMask)
     ExpMask <<= ShAmt;
 
@@ -2555,15 +2564,27 @@
   return RISCVBitmanipPat{Src, (unsigned)ShAmt, IsSHL};
 }
 
+static Optional<RISCVBitmanipPat> matchGREVIPat(SDValue Op) {
+  // These are the unshifted masks which we use to match bit-manipulation
+  // patterns. They may be shifted left in certain circumstances.
+  static const uint64_t BitmanipMasks[] = {
+      0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
+      0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
+  };
+
+  return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
 // Match the following pattern as a GREVI(W) operation
 // (or (BITMANIP_SHL x), (BITMANIP_SRL x))
 static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG,
                                const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
   EVT VT = Op.getValueType();
   if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
-    auto LHS = matchRISCVBitmanipPat(Op.getOperand(0));
-    auto RHS = matchRISCVBitmanipPat(Op.getOperand(1));
+    auto LHS = matchGREVIPat(Op.getOperand(0));
+    auto RHS = matchGREVIPat(Op.getOperand(1));
     if (LHS && RHS && LHS->formsPairWith(*RHS)) {
       SDLoc DL(Op);
       return DAG.getNode(
@@ -2585,6 +2606,7 @@
 // 4. (or (rotl/rotr x, bitwidth/2), x)
 static SDValue combineORToGORC(SDValue Op, SelectionDAG &DAG,
                                const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
   EVT VT = Op.getValueType();
   if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
@@ -2623,14 +2645,14 @@
       return SDValue();
     SDValue OrOp0 = Op0.getOperand(0);
     SDValue OrOp1 = Op0.getOperand(1);
-    auto LHS = matchRISCVBitmanipPat(OrOp0);
+    auto LHS = matchGREVIPat(OrOp0);
     // OR is commutable so swap the operands and try again: x might have been
     // on the left
     if (!LHS) {
       std::swap(OrOp0, OrOp1);
-      LHS = matchRISCVBitmanipPat(OrOp0);
+      LHS = matchGREVIPat(OrOp0);
     }
-    auto RHS = matchRISCVBitmanipPat(Op1);
+    auto RHS = matchGREVIPat(Op1);
     if (LHS && RHS && LHS->formsPairWith(*RHS) && LHS->Op == OrOp1) {
       return DAG.getNode(
           RISCVISD::GORCI, DL, VT, LHS->Op,
@@ -2640,6 +2662,93 @@
   return SDValue();
 }
+
+static Optional<RISCVBitmanipPat> matchSHFLPat(SDValue Op) {
+  // These are the unshifted masks which we use to match bit-manipulation
+  // patterns. They may be shifted left in certain circumstances.
+  static const uint64_t BitmanipMasks[] = {
+      0x2222222222222222ULL, 0x0C0C0C0C0C0C0C0CULL, 0x00F000F000F000F0ULL,
+      0x0000FF000000FF00ULL, 0x00000000FFFF0000ULL
+  };
+
+  return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
+// Match (or (or (SHFL_SHL x), (SHFL_SHR x)), (SHFL_AND x))
+static SDValue combineORToSHFL(SDValue Op, SelectionDAG &DAG,
+                               const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
+  EVT VT = Op.getValueType();
+
+  if (VT != MVT::i32 && VT != Subtarget.getXLenVT())
+    return SDValue();
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+
+  // OR is commutable so canonicalize the second OR to the LHS.
+  if (Op0.getOpcode() != ISD::OR)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() != ISD::OR)
+    return SDValue();
+
+  // We found an inner OR, so our operands are the operands of the inner OR
+  // and the other operand of the outer OR.
+  SDValue A = Op0.getOperand(0);
+  SDValue B = Op0.getOperand(1);
+  SDValue C = Op1;
+
+  auto Match1 = matchSHFLPat(A);
+  auto Match2 = matchSHFLPat(B);
+
+  // If neither matched, we failed.
+  if (!Match1 && !Match2)
+    return SDValue();
+
+  // We had at least one match. If one failed, try the remaining C operand.
+  if (!Match1) {
+    std::swap(A, C);
+    Match1 = matchSHFLPat(A);
+    if (!Match1)
+      return SDValue();
+  } else if (!Match2) {
+    std::swap(B, C);
+    Match2 = matchSHFLPat(B);
+    if (!Match2)
+      return SDValue();
+  }
+  assert(Match1 && Match2);
+
+  // Make sure our matches pair up.
+  if (!Match1->formsPairWith(*Match2))
+    return SDValue();
+
+  // All that remains is to make sure C is an AND with the same input that
+  // masks out the bits that are being shuffled.
+  if (C.getOpcode() != ISD::AND || !isa<ConstantSDNode>(C.getOperand(1)) ||
+      C.getOperand(0) != Match1->Op)
+    return SDValue();
+
+  uint64_t Mask = C.getConstantOperandVal(1);
+
+  static const uint64_t BitmanipMasks[] = {
+      0x9999999999999999ULL, 0xC3C3C3C3C3C3C3C3ULL, 0xF00FF00FF00FF00FULL,
+      0xFF0000FFFF0000FFULL, 0xFFFF00000000FFFFULL,
+  };
+
+  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+  unsigned MaskIdx = Log2_32(Match1->ShAmt);
+  uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
+  if (Mask != ExpMask)
+    return SDValue();
+
+  SDLoc DL(Op);
+  return DAG.getNode(
+      RISCVISD::SHFLI, DL, VT, Match1->Op,
+      DAG.getTargetConstant(Match1->ShAmt, DL, Subtarget.getXLenVT()));
+}
+
 // Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is
 // non-zero, and to x when it is. Any repeated GREVI stage undoes itself.
 // Combine (GORCI (GORCI x, C2), C1) -> (GORCI x, C1|C2). Repeated stage does
@@ -2819,6 +2928,8 @@
       return GREV;
     if (auto GORC = combineORToGORC(SDValue(N, 0), DCI.DAG, Subtarget))
       return GORC;
+    if (auto SHFL = combineORToSHFL(SDValue(N, 0), DCI.DAG, Subtarget))
+      return SHFL;
     break;
   case RISCVISD::SELECT_CC: {
     // Transform
@@ -3042,6 +3153,19 @@
     // more precise answer could be calculated for SRAW depending on known
     // bits in the shift amount.
     return 33;
+  case RISCVISD::SHFLI: {
+    // There is no SHFLIW, but an i64 SHFLI with bit 4 of the control word
+    // cleared doesn't affect bit 31. The upper 32 bits will be shuffled, but
+    // will stay within the upper 32 bits. If there were more than 32 sign bits
+    // before, there will be at least 33 sign bits after.
+    if (Op.getValueType() == MVT::i64 &&
+        (Op.getConstantOperandVal(1) & 0x10) == 0) {
+      unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+      if (Tmp > 32)
+        return 33;
+    }
+    break;
+  }
   case RISCVISD::VMV_X_S:
     // The number of sign bits of the scalar result is computed by obtaining the
     // element type of the input vector operand, subtracting its width from the
@@ -4705,6 +4829,7 @@
   NODE_NAME_CASE(GREVIW)
   NODE_NAME_CASE(GORCI)
   NODE_NAME_CASE(GORCIW)
+  NODE_NAME_CASE(SHFLI)
   NODE_NAME_CASE(VMV_V_X_VL)
   NODE_NAME_CASE(VFMV_V_F_VL)
   NODE_NAME_CASE(VMV_X_S)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -671,8 +671,10 @@
 def riscv_greviw : SDNode<"RISCVISD::GREVIW", SDTIntBinOp, []>;
 def riscv_gorci : SDNode<"RISCVISD::GORCI", SDTIntBinOp, []>;
 def riscv_gorciw : SDNode<"RISCVISD::GORCIW", SDTIntBinOp, []>;
+def riscv_shfli : SDNode<"RISCVISD::SHFLI", SDTIntBinOp, []>;
 
 let Predicates = [HasStdExtZbp] in {
+def : Pat<(riscv_shfli GPR:$rs1, timm:$shamt), (SHFLI GPR:$rs1, timm:$shamt)>;
 def : Pat<(riscv_grevi GPR:$rs1, timm:$shamt), (GREVI GPR:$rs1, timm:$shamt)>;
 def : Pat<(riscv_gorci GPR:$rs1, timm:$shamt), (GORCI GPR:$rs1, timm:$shamt)>;
@@ -788,48 +790,6 @@
 def : Pat<(and GPR:$rs, 0x000000000000FFFF), (ZEXTH_RV64 GPR:$rs)>;
 }
 
-let Predicates = [HasStdExtZbp, IsRV32] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)),
-                  (and GPR:$rs1, (i32 0xFF0000FF))),
-              (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))),
-          (SHFLI GPR:$rs1, (i32 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)),
-                  (and GPR:$rs1, (i32 0xF00FF00F))),
-              (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))),
-          (SHFLI GPR:$rs1, (i32 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)),
-                  (and GPR:$rs1, (i32 0xC3C3C3C3))),
-              (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))),
-          (SHFLI GPR:$rs1, (i32 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)),
-                  (and GPR:$rs1, (i32 0x99999999))),
-              (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))),
-          (SHFLI GPR:$rs1, (i32 1))>;
-} // Predicates = [HasStdExtZbp, IsRV32]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)),
-                  (and GPR:$rs1, (i64 0xFFFF00000000FFFF))),
-              (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))),
-          (SHFLI GPR:$rs1, (i64 16))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)),
-                  (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))),
-              (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))),
-          (SHFLI GPR:$rs1, (i64 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)),
-                  (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))),
-              (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))),
-          (SHFLI GPR:$rs1, (i64 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)),
-                  (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))),
-              (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))),
-          (SHFLI GPR:$rs1, (i64 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)),
-                  (and GPR:$rs1, (i64 0x9999999999999999))),
-              (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))),
-          (SHFLI GPR:$rs1, (i64 1))>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
 let Predicates = [HasStdExtZba] in {
 def : Pat<(add (shl GPR:$rs1, (XLenVT 1)), GPR:$rs2),
           (SH1ADD GPR:$rs1, GPR:$rs2)>;
diff --git
a/llvm/test/CodeGen/RISCV/rv32Zbp.ll b/llvm/test/CodeGen/RISCV/rv32Zbp.ll --- a/llvm/test/CodeGen/RISCV/rv32Zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32Zbp.ll @@ -2850,8 +2850,8 @@ ; RV32I-NEXT: addi a3, a3, 1092 ; RV32I-NEXT: and a5, a5, a3 ; RV32I-NEXT: and a3, a4, a3 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: or a3, a5, a6 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: or a3, a6, a5 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: lui a4, 139810 @@ -2876,7 +2876,7 @@ %and = and i64 %a, -7378697629483820647 %shl = shl i64 %a, 1 %and1 = and i64 %shl, 4919131752989213764 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 1 %and2 = and i64 %shr, 2459565876494606882 %or3 = or i64 %or, %and2 @@ -2898,7 +2898,7 @@ ; RV32I-NEXT: lui a2, 49345 ; RV32I-NEXT: addi a2, a2, -1012 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl2_i32: @@ -2916,7 +2916,7 @@ %or = or i32 %and1, %and %shr = lshr i32 %a, 2 %and2 = and i32 %shr, 202116108 - %or3 = or i32 %or, %and2 + %or3 = or i32 %and2, %or ret i32 %or3 } @@ -2933,16 +2933,16 @@ ; RV32I-NEXT: addi a3, a3, 48 ; RV32I-NEXT: and a5, a5, a3 ; RV32I-NEXT: and a3, a4, a3 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: or a3, a5, a6 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: or a3, a6, a5 ; RV32I-NEXT: srli a0, a0, 2 ; RV32I-NEXT: srli a1, a1, 2 ; RV32I-NEXT: lui a4, 49345 ; RV32I-NEXT: addi a4, a4, -1012 ; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl2_i64: @@ -2959,10 +2959,10 @@ %and = and i64 %a, -4340410370284600381 %shl = shl i64 %a, 2 %and1 = and i64 %shl, 3472328296227680304 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 2 %and2 = and i64 %shr, 868082074056920076 - %or3 = or i64 %or, %and2 + %or3 = or i64 %and2, %or ret i64 %or3 } @@ -2976,12 +2976,12 @@ ; RV32I-NEXT: lui a3, 61441 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: srli a0, a0, 4 -; RV32I-NEXT: lui a2, 3840 -; RV32I-NEXT: addi a2, a2, 240 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: lui a3, 3840 +; RV32I-NEXT: addi a3, a3, 240 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl4_i32: @@ -2996,10 +2996,10 @@ %and = and i32 %a, -267390961 %shl = shl i32 %a, 4 %and1 = and i32 %shl, 251662080 - %or = or i32 %and1, %and %shr = lshr i32 %a, 4 %and2 = and i32 %shr, 15728880 - %or3 = or i32 %or, %and2 + %or = or i32 %and2, %and + %or3 = or i32 %or, %and1 ret i32 %or3 } @@ -3008,24 +3008,24 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 983295 ; RV32I-NEXT: addi a2, a2, 15 -; RV32I-NEXT: and a6, a0, a2 -; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: and a6, a1, a2 +; RV32I-NEXT: and a2, a0, a2 ; RV32I-NEXT: slli a4, a1, 4 ; RV32I-NEXT: slli a5, a0, 4 ; RV32I-NEXT: lui a3, 61441 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a5, a5, a3 ; RV32I-NEXT: and a3, a4, a3 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: or a3, a5, a6 -; RV32I-NEXT: srli a0, a0, 4 ; RV32I-NEXT: srli a1, a1, 4 +; RV32I-NEXT: srli a0, a0, 4 ; RV32I-NEXT: lui a4, 3840 ; RV32I-NEXT: addi a4, a4, 240 -; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: or a1, a3, a1 
+; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl4_i64: @@ -3042,10 +3042,10 @@ %and = and i64 %a, -1148435428713435121 %shl = shl i64 %a, 4 %and1 = and i64 %shl, 1080880403494997760 - %or = or i64 %and1, %and %shr = lshr i64 %a, 4 %and2 = and i64 %shr, 67555025218437360 - %or3 = or i64 %or, %and2 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and ret i64 %or3 } @@ -3058,12 +3058,12 @@ ; RV32I-NEXT: slli a2, a0, 8 ; RV32I-NEXT: lui a3, 4080 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi a2, a2, -256 -; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: addi a3, a3, -256 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl8_i32: @@ -3078,10 +3078,10 @@ %and = and i32 %a, -16776961 %shl = shl i32 %a, 8 %and1 = and i32 %shl, 16711680 - %or = or i32 %and1, %and %shr = lshr i32 %a, 8 %and2 = and i32 %shr, 65280 - %or3 = or i32 %or, %and2 + %or = or i32 %and, %and2 + %or3 = or i32 %or, %and1 ret i32 %or3 } @@ -3090,23 +3090,23 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 1044480 ; RV32I-NEXT: addi a2, a2, 255 -; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a6, a0, a2 ; RV32I-NEXT: and a2, a1, a2 -; RV32I-NEXT: slli a4, a1, 8 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: lui a6, 4080 -; RV32I-NEXT: and a5, a5, a6 -; RV32I-NEXT: and a4, a4, a6 -; RV32I-NEXT: or a2, a4, a2 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: slli a4, a0, 8 +; RV32I-NEXT: slli a5, a1, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 ; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: lui a4, 16 ; RV32I-NEXT: addi a4, a4, -256 -; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: or a0, a0, a6 ; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a1, a5, a1 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl8_i64: @@ -3123,10 +3123,10 @@ %and = and i64 %a, -72056494543077121 %shl = shl i64 %a, 8 %and1 = and i64 %shl, 71776119077928960 - %or = or i64 %and1, %and %shr = lshr i64 %a, 8 %and2 = and i64 %shr, 280375465148160 - %or3 = or i64 %or, %and2 + %or = or i64 %and2, %and + %or3 = or i64 %and1, %or ret i64 %or3 } diff --git a/llvm/test/CodeGen/RISCV/rv64Zbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbp.ll --- a/llvm/test/CodeGen/RISCV/rv64Zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64Zbp.ll @@ -3430,8 +3430,42 @@ ret i64 %2 } -; There's no [un]shfliw instruction as slliu.w occupies the encoding slot that -; would be occupied by shfliw. 
+define signext i32 @shfl1_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl1_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 629146 +; RV64I-NEXT: addiw a1, a1, -1639 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 1 +; RV64I-NEXT: lui a3, 279620 +; RV64I-NEXT: addiw a3, a3, 1092 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a2, 139810 +; RV64I-NEXT: addiw a2, a2, 546 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl1_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip.n a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl1_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip.n a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -1717986919 + %shl = shl i32 %a, 1 + %and1 = and i32 %shl, 1145324612 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 1 + %and2 = and i32 %shr, 572662306 + %or3 = or i32 %or, %and2 + ret i32 %or3 +} define i64 @shfl1_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl1_i64: @@ -3455,7 +3489,7 @@ ; RV64I-NEXT: slli a4, a3, 14 ; RV64I-NEXT: addi a4, a4, 1092 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: slli a2, a3, 13 ; RV64I-NEXT: addi a2, a2, 546 @@ -3475,13 +3509,50 @@ %and = and i64 %a, -7378697629483820647 %shl = shl i64 %a, 1 %and1 = and i64 %shl, 4919131752989213764 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 1 %and2 = and i64 %shr, 2459565876494606882 %or3 = or i64 %or, %and2 ret i64 %or3 } +define signext i32 @shfl2_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl2_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 801852 +; RV64I-NEXT: addiw a1, a1, 963 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 2 +; RV64I-NEXT: lui a3, 197379 +; RV64I-NEXT: addiw a3, a3, 48 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: lui a2, 49345 +; RV64I-NEXT: addiw a2, a2, -1012 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl2_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip2.b a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl2_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip2.b a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -1010580541 + %shl = shl i32 %a, 2 + %and1 = and i32 %shl, 808464432 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 2 + %and2 = and i32 %shr, 202116108 + %or3 = or i32 %and2, %or + ret i32 %or3 +} + define i64 @shfl2_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl2_i64: ; RV64I: # %bb.0: @@ -3504,14 +3575,14 @@ ; RV64I-NEXT: slli a4, a4, 12 ; RV64I-NEXT: addi a4, a4, 48 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: slli a2, a3, 14 ; RV64I-NEXT: addi a2, a2, 193 ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, -1012 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: shfl2_i64: @@ -3526,13 +3597,50 @@ %and = and i64 %a, -4340410370284600381 %shl = shl i64 %a, 2 %and1 = and i64 %shl, 3472328296227680304 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 2 %and2 = and i64 %shr, 868082074056920076 - %or3 = or i64 %or, %and2 + %or3 = or i64 %and2, %or ret i64 %or3 } +define signext i32 @shfl4_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl4_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui 
a1, 983295 +; RV64I-NEXT: addiw a1, a1, 15 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 4 +; RV64I-NEXT: lui a3, 61441 +; RV64I-NEXT: addiw a3, a3, -256 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: lui a3, 3840 +; RV64I-NEXT: addiw a3, a3, 240 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl4_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip4.h a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl4_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip4.h a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -267390961 + %shl = shl i32 %a, 4 + %and1 = and i32 %shl, 251662080 + %shr = lshr i32 %a, 4 + %and2 = and i32 %shr, 15728880 + %or = or i32 %and2, %and + %or3 = or i32 %or, %and1 + ret i32 %or3 +} + define i64 @shfl4_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl4_i64: ; RV64I: # %bb.0: @@ -3555,12 +3663,12 @@ ; RV64I-NEXT: slli a4, a4, 12 ; RV64I-NEXT: addi a4, a4, -256 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: slli a2, a3, 20 -; RV64I-NEXT: addi a2, a2, 240 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: slli a3, a3, 20 +; RV64I-NEXT: addi a3, a3, 240 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: shfl4_i64: @@ -3575,13 +3683,49 @@ %and = and i64 %a, -1148435428713435121 %shl = shl i64 %a, 4 %and1 = and i64 %shl, 1080880403494997760 - %or = or i64 %and1, %and %shr = lshr i64 %a, 4 %and2 = and i64 %shr, 67555025218437360 - %or3 = or i64 %or, %and2 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and ret i64 %or3 } +define signext i32 @shfl8_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl8_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1044480 +; RV64I-NEXT: addiw a1, a1, 255 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: lui a3, 4080 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: lui a3, 16 +; RV64I-NEXT: addiw a3, a3, -256 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl8_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip8.w a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl8_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip8.w a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -16776961 + %shl = shl i32 %a, 8 + %and1 = and i32 %shl, 16711680 + %shr = lshr i32 %a, 8 + %and2 = and i32 %shr, 65280 + %or = or i32 %and, %and2 + %or3 = or i32 %or, %and1 + ret i32 %or3 +} + define i64 @shfl8_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl8_i64: ; RV64I: # %bb.0: @@ -3598,14 +3742,14 @@ ; RV64I-NEXT: addi a4, a4, 255 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 8 -; RV64I-NEXT: slli a2, a3, 24 -; RV64I-NEXT: addi a2, a2, 1 -; RV64I-NEXT: slli a2, a2, 16 -; RV64I-NEXT: addi a2, a2, -256 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: addi a3, a3, 1 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, -256 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: shfl8_i64: @@ -3620,10 +3764,10 @@ %and = and i64 %a, -72056494543077121 %shl = shl i64 %a, 8 %and1 = and i64 %shl, 71776119077928960 - %or = or i64 %and1, %and %shr = lshr i64 %a, 8 %and2 = and i64 %shr, 
280375465148160 - %or3 = or i64 %or, %and2 + %or = or i64 %and2, %and + %or3 = or i64 %and1, %or ret i64 %or3 }
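
Reviewer note (not part of the patch): the listing below is a standalone C++ sketch of the bit pattern combineORToSHFL recognizes for a 32-bit value, i.e. (x & keep) | ((x << N) & shlMask) | ((x >> N) & srlMask), where srlMask comes from the matchSHFLPat table (0x22222222, 0x0C0C0C0C, 0x00F000F0, 0x0000FF00), shlMask = srlMask << N, and keep = ~(shlMask | srlMask) is the AND mask table checked in combineORToSHFL. The helper names (shflStage32, SrlMasks) are invented for the example; only the mask constants come from the patch and the tests above.

// Standalone sketch, not LLVM code: one SHFL stage over a 32-bit value.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Per-stage SRL masks for N = 1, 2, 4, 8 (same constants as matchSHFLPat,
// truncated to 32 bits).
static const uint32_t SrlMasks[] = {0x22222222u, 0x0C0C0C0Cu, 0x00F000F0u,
                                    0x0000FF00u};

// Swap the two middle N-bit blocks of every 4N-bit group; N must be 1, 2, 4
// or 8.
static uint32_t shflStage32(uint32_t X, unsigned N) {
  unsigned Idx = 0;
  while ((1u << Idx) != N)
    ++Idx;
  uint32_t SrlMask = SrlMasks[Idx];
  uint32_t ShlMask = SrlMask << N;
  uint32_t Keep = ~(ShlMask | SrlMask);
  return (X & Keep) | ((X << N) & ShlMask) | ((X >> N) & SrlMask);
}

int main() {
  // For N = 4 the stationary mask is 0xF00FF00F, exactly the AND constant
  // (-267390961) in the shfl4_i32 tests and entry [2] of the second mask
  // table in combineORToSHFL.
  assert((uint32_t)~((SrlMasks[2] << 4) | SrlMasks[2]) == 0xF00FF00Fu);
  // zip4.h of 0x12345678 swaps the inner nibbles of each half-word,
  // giving 0x13245768.
  printf("shfl4(0x12345678) = 0x%08X\n", shflStage32(0x12345678u, 4));
  return 0;
}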