diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -956,6 +956,7 @@
   SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1025,6 +1025,10 @@
     setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
     setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
     setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
+    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
 
     // AArch64 doesn't have MUL.2d:
     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
@@ -4723,8 +4727,7 @@
   case ISD::ABS:
     return LowerABS(Op, DAG);
   case ISD::BITREVERSE:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
-                               /*OverrideNEON=*/true);
+    return LowerBitreverse(Op, DAG);
   case ISD::BSWAP:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
   case ISD::CTLZ:
@@ -6898,6 +6901,56 @@
   return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
 }
 
+SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
+                               true);
+
+  SDLoc DL(Op);
+  SDValue REVB;
+  MVT VST;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("Invalid type for bitreverse!");
+
+  case MVT::v2i32: {
+    VST = MVT::v8i8;
+    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+
+  case MVT::v4i32: {
+    VST = MVT::v16i8;
+    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+
+  case MVT::v1i64: {
+    VST = MVT::v8i8;
+    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+
+  case MVT::v2i64: {
+    VST = MVT::v16i8;
+    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+  }
+
+  return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+                     DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
+}
+
 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 
   if (Op.getValueType().isVector())
diff --git a/llvm/test/CodeGen/AArch64/bitreverse.ll b/llvm/test/CodeGen/AArch64/bitreverse.ll
--- a/llvm/test/CodeGen/AArch64/bitreverse.ll
+++ b/llvm/test/CodeGen/AArch64/bitreverse.ll
@@ -8,13 +8,8 @@
 define <2 x i16> @f(<2 x i16> %a) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    rbit w8, w8
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    rbit w8, w9
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    ret
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
@@ -118,14 +113,8 @@
 
 define <2 x i32> @g_vec_2x32(<2 x i32> %a) {
 ; CHECK-LABEL: g_vec_2x32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    rbit w8, w8
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    rbit w8, w9
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %b = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a)
@@ -137,18 +126,8 @@
 define <4 x i32> @g_vec_4x32(<4 x i32> %a) {
 ; CHECK-LABEL: g_vec_4x32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    rbit w10, w10
-; CHECK-NEXT:    mov w9, v0.s[2]
-; CHECK-NEXT:    mov w11, v0.s[3]
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    rbit w8, w8
-; CHECK-NEXT:    rbit w9, w9
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    mov v0.s[2], w9
-; CHECK-NEXT:    rbit w8, w11
-; CHECK-NEXT:    mov v0.s[3], w8
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %b
@@ -159,10 +138,8 @@
 define <1 x i64> @g_vec_1x64(<1 x i64> %a) {
 ; CHECK-LABEL: g_vec_1x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    rbit x8, x8
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %b = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %a)
   ret <1 x i64> %b
@@ -173,12 +150,8 @@
 define <2 x i64> @g_vec_2x64(<2 x i64> %a) {
 ; CHECK-LABEL: g_vec_2x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    rbit x8, x8
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    rbit x8, x9
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
   ret <2 x i64> %b
diff --git a/llvm/test/CodeGen/AArch64/neon_rbit.ll b/llvm/test/CodeGen/AArch64/neon_rbit.ll
--- a/llvm/test/CodeGen/AArch64/neon_rbit.ll
+++ b/llvm/test/CodeGen/AArch64/neon_rbit.ll
@@ -55,14 +55,8 @@
 define <2 x i32> @rbit_2x32(<2 x i32> %A) {
 ; CHECK-LABEL: rbit_2x32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    rbit w8, w8
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    rbit w8, w9
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.rbit.v2i32(<2 x i32> %A)
   ret <2 x i32> %tmp3
@@ -73,18 +67,8 @@
 define <4 x i32> @rbit_4x32(<4 x i32> %A) {
 ; CHECK-LABEL: rbit_4x32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    rbit w10, w10
-; CHECK-NEXT:    mov w9, v0.s[2]
-; CHECK-NEXT:    mov w11, v0.s[3]
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    rbit w8, w8
-; CHECK-NEXT:    rbit w9, w9
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    mov v0.s[2], w9
-; CHECK-NEXT:    rbit w8, w11
-; CHECK-NEXT:    mov v0.s[3], w8
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.rbit.v4i32(<4 x i32> %A)
   ret <4 x i32> %tmp3
@@ -95,10 +79,8 @@
 define <1 x i64> @rbit_1x64(<1 x i64> %A) {
 ; CHECK-LABEL: rbit_1x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    rbit x8, x8
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %tmp3 = call <1 x i64> @llvm.aarch64.neon.rbit.v1i64(<1 x i64> %A)
   ret <1 x i64> %tmp3
@@ -109,12 +91,8 @@
 define <2 x i64> @rbit_2x64(<2 x i64> %A) {
 ; CHECK-LABEL: rbit_2x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    rbit x8, x8
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    rbit x8, x9
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.rbit.v2i64(<2 x i64> %A)
   ret <2 x i64> %tmp3