diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7346,45 +7346,57 @@
 }
 
 // Combine
-// ROTR ((GREV x, 24), 16) -> (GREVI x, 8)
-// ROTL ((GREV x, 24), 16) -> (GREVI x, 8)
-// RORW ((GREVW x, 24), 16) -> (GREVIW x, 8)
-// ROLW ((GREVW x, 24), 16) -> (GREVIW x, 8)
+// ROTR ((GREV x, 24), 16) -> (GREVI x, 8) for RV32
+// ROTL ((GREV x, 24), 16) -> (GREVI x, 8) for RV32
+// ROTR ((GREV x, 56), 32) -> (GREVI x, 24) for RV64
+// ROTL ((GREV x, 56), 32) -> (GREVI x, 24) for RV64
+// RORW ((GREVW x, 24), 16) -> (GREVIW x, 8) for RV64
+// ROLW ((GREVW x, 24), 16) -> (GREVIW x, 8) for RV64
+// The grev patterns represent BSWAP.
+// FIXME: This can be generalized to any GREV. We just need to toggle the MSB
+// of the grev shift amount.
 static SDValue combineROTR_ROTL_RORW_ROLW(SDNode *N, SelectionDAG &DAG,
                                           const RISCVSubtarget &Subtarget) {
+  bool IsWInstruction =
+      N->getOpcode() == RISCVISD::RORW || N->getOpcode() == RISCVISD::ROLW;
   assert((N->getOpcode() == ISD::ROTR || N->getOpcode() == ISD::ROTL ||
-          N->getOpcode() == RISCVISD::RORW ||
-          N->getOpcode() == RISCVISD::ROLW) &&
+          IsWInstruction) &&
          "Unexpected opcode!");
   SDValue Src = N->getOperand(0);
+  EVT VT = N->getValueType(0);
   SDLoc DL(N);
-  unsigned Opc;
 
   if (!Subtarget.hasStdExtZbp())
     return SDValue();
 
-  if ((N->getOpcode() == ISD::ROTR || N->getOpcode() == ISD::ROTL) &&
-      Src.getOpcode() == RISCVISD::GREV)
-    Opc = RISCVISD::GREV;
-  else if ((N->getOpcode() == RISCVISD::RORW ||
-            N->getOpcode() == RISCVISD::ROLW) &&
-           Src.getOpcode() == RISCVISD::GREVW)
-    Opc = RISCVISD::GREVW;
-  else
+  unsigned GrevOpc = IsWInstruction ? RISCVISD::GREVW : RISCVISD::GREV;
+  if (Src.getOpcode() != GrevOpc)
     return SDValue();
 
   if (!isa<ConstantSDNode>(N->getOperand(1)) ||
       !isa<ConstantSDNode>(Src.getOperand(1)))
     return SDValue();
 
+  unsigned BitWidth = IsWInstruction ? 32 : VT.getSizeInBits();
+  assert(isPowerOf2_32(BitWidth) && "Expected a power of 2");
+
+  // The rotate needs to be by half the bitwidth for ROTR/ROTL or by 16 for
+  // RORW/ROLW, and the grev should be the bswap encoding for this width.
   unsigned ShAmt1 = N->getConstantOperandVal(1);
   unsigned ShAmt2 = Src.getConstantOperandVal(1);
-  if (ShAmt1 != 16 && ShAmt2 != 24)
+  if (BitWidth < 16 || ShAmt1 != (BitWidth / 2) || ShAmt2 != (BitWidth - 8))
     return SDValue();
 
   Src = Src.getOperand(0);
-  return DAG.getNode(Opc, DL, N->getValueType(0), Src,
-                     DAG.getConstant(8, DL, N->getOperand(1).getValueType()));
+
+  // Toggle the MSB of the grev shift amount.
+  unsigned CombinedShAmt = ShAmt1 ^ ShAmt2;
+  if (CombinedShAmt == 0)
+    return Src;
+
+  return DAG.getNode(
+      GrevOpc, DL, VT, Src,
+      DAG.getConstant(CombinedShAmt, DL, N->getOperand(1).getValueType()));
 }
 
 // Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is
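Note on the combine above: grev with shift amount BitWidth-8 is bswap, a rotate by half the bit width is itself a grev with shift amount BitWidth/2, and composed grevs XOR their shift amounts, which is why the rotate folds into the grev. The sketch below only illustrates that identity for the 64-bit case; it is not part of the patch, and grev64/rotl64 are ad-hoc reference helpers written from the Zbp generalized-reverse definition.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Ad-hoc reference grev (generalized bit reverse) for 64 bits; not LLVM code.
static uint64_t grev64(uint64_t x, unsigned k) {
  static const uint64_t M[6] = {0x5555555555555555ULL, 0x3333333333333333ULL,
                                0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
                                0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
  for (unsigned i = 0; i < 6; ++i)
    if (k & (1u << i))
      x = ((x & M[i]) << (1u << i)) | ((x & ~M[i]) >> (1u << i));
  return x;
}

static uint64_t rotl64(uint64_t x, unsigned r) { // r must be in (0, 64)
  return (x << r) | (x >> (64 - r));
}

int main() {
  const uint64_t samples[4] = {0x0123456789ABCDEFULL, 0xFFFFFFFF00000000ULL,
                               0x8000000000000001ULL, 0xDEADBEEFCAFEF00DULL};
  for (uint64_t x : samples) {
    // grev 56 is bswap; a rotate by 32 is grev 32; grev shift amounts compose
    // by XOR, so the pair folds to grev 56^32 == grev 24 (rev8.w).
    assert(rotl64(grev64(x, 56), 32) == grev64(x, 56 ^ 32));
    // A right rotate by 32 is the same permutation, so ROTR folds identically.
    assert(((grev64(x, 56) >> 32) | (grev64(x, 56) << 32)) == grev64(x, 24));
  }
  std::puts("rot(bswap(x), 32) == grev(x, 24) on all samples");
  return 0;
}

This mirrors the bswap_rotr_i64/bswap_rotl_i64 checks added to rv64zbp.ll below, where both sequences lower to a single rev8.w.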
diff --git a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll
@@ -340,11 +340,13 @@
   ret i64 %tmp
 }
 
-; FIXME: This is miscompiled. We can't fold the rotate with the grev.
+; Make sure we don't fold this rotate with the grev. We can only fold a rotate
+; by 32.
 define i64 @grevi64_24_rotl_16(i64 %a) nounwind {
 ; RV64ZBP-LABEL: grevi64_24_rotl_16:
 ; RV64ZBP:       # %bb.0:
-; RV64ZBP-NEXT:    rev8.h a0, a0
+; RV64ZBP-NEXT:    rev8.w a0, a0
+; RV64ZBP-NEXT:    rori a0, a0, 48
 ; RV64ZBP-NEXT:    ret
   %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 24)
   %tmp1 = call i64 @llvm.fshl.i64(i64 %tmp, i64 %tmp, i64 16)
@@ -352,11 +354,13 @@
 }
 declare i64 @llvm.fshl.i64(i64, i64, i64)
 
-; FIXME: This is miscompiled. We can't fold the rotate with the grev.
+; Make sure we don't fold this rotate with the grev. We can only fold a rotate
+; by 32.
 define i64 @grevi64_24_rotr_16(i64 %a) nounwind {
 ; RV64ZBP-LABEL: grevi64_24_rotr_16:
 ; RV64ZBP:       # %bb.0:
-; RV64ZBP-NEXT:    rev8.h a0, a0
+; RV64ZBP-NEXT:    rev8.w a0, a0
+; RV64ZBP-NEXT:    rori a0, a0, 16
 ; RV64ZBP-NEXT:    ret
   %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 24)
   %tmp1 = call i64 @llvm.fshr.i64(i64 %tmp, i64 %tmp, i64 16)
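Why the intrinsic tests above must not fold: on i64 a rotate by 16 is not a grev, and grev 24 followed by a rotate left by 16, for example, swaps byte pairs (0,5), (1,4), (2,3), (6,7), which is not an XOR-of-index permutation, so no single grev produces it; the expected lowering stays rev8.w plus a rori. The brute-force check below only illustrates that claim; it is not part of the patch, and grev64 is the same ad-hoc reference helper as above.

#include <cstdint>
#include <cstdio>

// Ad-hoc reference grev (generalized bit reverse) for 64 bits; not LLVM code.
static uint64_t grev64(uint64_t x, unsigned k) {
  static const uint64_t M[6] = {0x5555555555555555ULL, 0x3333333333333333ULL,
                                0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
                                0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
  for (unsigned i = 0; i < 6; ++i)
    if (k & (1u << i))
      x = ((x & M[i]) << (1u << i)) | ((x & ~M[i]) >> (1u << i));
  return x;
}

int main() {
  const uint64_t samples[2] = {0x0123456789ABCDEFULL, 0xF0E1D2C3B4A59687ULL};
  for (unsigned k = 0; k < 64; ++k) {
    bool matches = true;
    for (uint64_t x : samples) {
      uint64_t want = grev64(x, 24);
      want = (want << 16) | (want >> 48); // rotl by 16, as in the fshl test
      matches &= grev64(x, k) == want;
    }
    if (matches) {
      std::printf("unexpected: grev %u reproduces grev 24 + rotl 16\n", k);
      return 1;
    }
  }
  std::puts("no single grev shift amount reproduces grev 24 + rotl 16 on i64");
  return 0;
}

This matches the rev8.w + rori sequences now expected in the checks above.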
diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll
@@ -2745,6 +2745,96 @@
   ret i32 %2
 }
 
+define i64 @bswap_rotr_i64(i64 %a) {
+; RV64I-LABEL: bswap_rotr_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    lui a2, 4080
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 8
+; RV64I-NEXT:    li a3, 255
+; RV64I-NEXT:    slli a4, a3, 24
+; RV64I-NEXT:    and a2, a2, a4
+; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    srli a2, a0, 40
+; RV64I-NEXT:    lui a4, 16
+; RV64I-NEXT:    addiw a4, a4, -256
+; RV64I-NEXT:    and a2, a2, a4
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    slli a2, a0, 24
+; RV64I-NEXT:    slli a4, a3, 40
+; RV64I-NEXT:    and a2, a2, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    slli a4, a0, 40
+; RV64I-NEXT:    slli a3, a3, 48
+; RV64I-NEXT:    and a3, a4, a3
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBP-LABEL: bswap_rotr_i64:
+; RV64ZBP:       # %bb.0:
+; RV64ZBP-NEXT:    rev8.w a0, a0
+; RV64ZBP-NEXT:    ret
+  %1 = call i64 @llvm.bswap.i64(i64 %a)
+  %2 = call i64 @llvm.fshr.i64(i64 %1, i64 %1, i64 32)
+  ret i64 %2
+}
+
+define i64 @bswap_rotl_i64(i64 %a) {
+; RV64I-LABEL: bswap_rotl_i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    lui a2, 4080
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 8
+; RV64I-NEXT:    li a3, 255
+; RV64I-NEXT:    slli a4, a3, 24
+; RV64I-NEXT:    and a2, a2, a4
+; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    srli a2, a0, 40
+; RV64I-NEXT:    lui a4, 16
+; RV64I-NEXT:    addiw a4, a4, -256
+; RV64I-NEXT:    and a2, a2, a4
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    slli a2, a0, 24
+; RV64I-NEXT:    slli a4, a3, 40
+; RV64I-NEXT:    and a2, a2, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    slli a4, a0, 40
+; RV64I-NEXT:    slli a3, a3, 48
+; RV64I-NEXT:    and a3, a4, a3
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBP-LABEL: bswap_rotl_i64:
+; RV64ZBP:       # %bb.0:
+; RV64ZBP-NEXT:    rev8.w a0, a0
+; RV64ZBP-NEXT:    ret
+  %1 = call i64 @llvm.bswap.i64(i64 %a)
+  %2 = call i64 @llvm.fshl.i64(i64 %1, i64 %1, i64 32)
+  ret i64 %2
+}
+
 define i32 @bitreverse_bswap_i32(i32 %a) {
 ; RV64I-LABEL: bitreverse_bswap_i32:
 ; RV64I:       # %bb.0:
@@ -2783,20 +2873,20 @@
 define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV64I-LABEL: bitreverse_bswap_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, %hi(.LCPI76_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI76_0)(a1)
+; RV64I-NEXT:    lui a1, %hi(.LCPI78_0)
+; RV64I-NEXT:    ld a1, %lo(.LCPI78_0)(a1)
 ; RV64I-NEXT:    srli a2, a0, 4
 ; RV64I-NEXT:    and a2, a2, a1
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI76_1)
-; RV64I-NEXT:    ld a1, %lo(.LCPI76_1)(a1)
+; RV64I-NEXT:    lui a1, %hi(.LCPI78_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI78_1)(a1)
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    srli a2, a0, 2
 ; RV64I-NEXT:    and a2, a2, a1
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, %hi(.LCPI76_2)
-; RV64I-NEXT:    ld a1, %lo(.LCPI76_2)(a1)
+; RV64I-NEXT:    lui a1, %hi(.LCPI78_2)
+; RV64I-NEXT:    ld a1, %lo(.LCPI78_2)(a1)
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    srli a2, a0, 1
@@ -2850,14 +2940,14 @@
 define i64 @shfl1_i64(i64 %a, i64 %b) nounwind {
 ; RV64I-LABEL: shfl1_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, %hi(.LCPI78_1)
-; RV64I-NEXT:    ld a1, %lo(.LCPI78_1)(a1)
-; RV64I-NEXT:    lui a2, %hi(.LCPI78_0)
-; RV64I-NEXT:    ld a2, %lo(.LCPI78_0)(a2)
+; RV64I-NEXT:    lui a1, %hi(.LCPI80_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI80_1)(a1)
+; RV64I-NEXT:    lui a2, %hi(.LCPI80_0)
+; RV64I-NEXT:    ld a2, %lo(.LCPI80_0)(a2)
 ; RV64I-NEXT:    slli a3, a0, 1
 ; RV64I-NEXT:    and a1, a3, a1
-; RV64I-NEXT:    lui a3, %hi(.LCPI78_2)
-; RV64I-NEXT:    ld a3, %lo(.LCPI78_2)(a3)
+; RV64I-NEXT:    lui a3, %hi(.LCPI80_2)
+; RV64I-NEXT:    ld a3, %lo(.LCPI80_2)(a3)
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    srli a0, a0, 1
@@ -2914,14 +3004,14 @@
 define i64 @shfl2_i64(i64 %a, i64 %b) nounwind {
 ; RV64I-LABEL: shfl2_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, %hi(.LCPI80_1)
-; RV64I-NEXT:    ld a1, %lo(.LCPI80_1)(a1)
-; RV64I-NEXT:    lui a2, %hi(.LCPI80_0)
-; RV64I-NEXT:    ld a2, %lo(.LCPI80_0)(a2)
+; RV64I-NEXT:    lui a1, %hi(.LCPI82_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI82_1)(a1)
+; RV64I-NEXT:    lui a2, %hi(.LCPI82_0)
+; RV64I-NEXT:    ld a2, %lo(.LCPI82_0)(a2)
 ; RV64I-NEXT:    slli a3, a0, 2
 ; RV64I-NEXT:    and a1, a3, a1
-; RV64I-NEXT:    lui a3, %hi(.LCPI80_2)
-; RV64I-NEXT:    ld a3, %lo(.LCPI80_2)(a3)
+; RV64I-NEXT:    lui a3, %hi(.LCPI82_2)
+; RV64I-NEXT:    ld a3, %lo(.LCPI82_2)(a3)
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    srli a0, a0, 2
@@ -2978,13 +3068,13 @@
 define i64 @shfl4_i64(i64 %a, i64 %b) nounwind {
 ; RV64I-LABEL: shfl4_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, %hi(.LCPI82_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI82_0)(a1)
-; RV64I-NEXT:    lui a2, %hi(.LCPI82_1)
-; RV64I-NEXT:    ld a2, %lo(.LCPI82_1)(a2)
+; RV64I-NEXT:    lui a1, %hi(.LCPI84_0)
+; RV64I-NEXT:    ld a1, %lo(.LCPI84_0)(a1)
+; RV64I-NEXT:    lui a2, %hi(.LCPI84_1)
+; RV64I-NEXT:    ld a2, %lo(.LCPI84_1)(a2)
 ; RV64I-NEXT:    slli a3, a0, 4
-; RV64I-NEXT:    lui a4, %hi(.LCPI82_2)
-; RV64I-NEXT:    ld a4, %lo(.LCPI82_2)(a4)
+; RV64I-NEXT:    lui a4, %hi(.LCPI84_2)
+; RV64I-NEXT:    ld a4, %lo(.LCPI84_2)(a4)
 ; RV64I-NEXT:    and a2, a3, a2
 ; RV64I-NEXT:    and a1, a0, a1
 ; RV64I-NEXT:    srli a0, a0, 4