diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10879,7 +10879,7 @@
   }
   case NEON::BI__builtin_neon_vrbit_v:
   case NEON::BI__builtin_neon_vrbitq_v: {
-    Int = Intrinsic::aarch64_neon_rbit;
+    Int = Intrinsic::bitreverse;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
   }
   case NEON::BI__builtin_neon_vaddv_u8:
diff --git a/clang/test/CodeGen/aarch64-neon-misc.c b/clang/test/CodeGen/aarch64-neon-misc.c
--- a/clang/test/CodeGen/aarch64-neon-misc.c
+++ b/clang/test/CodeGen/aarch64-neon-misc.c
@@ -1766,42 +1766,42 @@
 }

 // CHECK-LABEL: @test_vrbit_s8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a)
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
 // CHECK: ret <8 x i8> [[VRBIT_I]]
 int8x8_t test_vrbit_s8(int8x8_t a) {
   return vrbit_s8(a);
 }

 // CHECK-LABEL: @test_vrbitq_s8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a)
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 // CHECK: ret <16 x i8> [[VRBIT_I]]
 int8x16_t test_vrbitq_s8(int8x16_t a) {
   return vrbitq_s8(a);
 }

 // CHECK-LABEL: @test_vrbit_u8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a)
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
 // CHECK: ret <8 x i8> [[VRBIT_I]]
 uint8x8_t test_vrbit_u8(uint8x8_t a) {
   return vrbit_u8(a);
 }

 // CHECK-LABEL: @test_vrbitq_u8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a)
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 // CHECK: ret <16 x i8> [[VRBIT_I]]
 uint8x16_t test_vrbitq_u8(uint8x16_t a) {
   return vrbitq_u8(a);
 }

 // CHECK-LABEL: @test_vrbit_p8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a)
+// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
 // CHECK: ret <8 x i8> [[VRBIT_I]]
 poly8x8_t test_vrbit_p8(poly8x8_t a) {
   return vrbit_p8(a);
 }

 // CHECK-LABEL: @test_vrbitq_p8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a)
+// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 // CHECK: ret <16 x i8> [[VRBIT_I]]
 poly8x16_t test_vrbitq_p8(poly8x16_t a) {
   return vrbitq_p8(a);
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -444,9 +444,6 @@
   def int_aarch64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic;
   def int_aarch64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic;

-  // Vector Bitwise Reverse
-  def int_aarch64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic;
-
   // Vector Conversions Between Half-Precision and Single-Precision.
   def int_aarch64_neon_vcvtfp2hf
       : DefaultAttrsIntrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -553,6 +553,11 @@
                                         F->arg_begin()->getType());
       return true;
     }
+    if (Name.startswith("aarch64.neon.rbit")) {
+      NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse,
+                                        F->arg_begin()->getType());
+      return true;
+    }
     if (Name.startswith("arm.neon.vclz")) {
       Type* args[2] = {
         F->arg_begin()->getType(),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1023,6 +1023,8 @@
     setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
     setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
+    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);

     // AArch64 doesn't have MUL.2d:
     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4166,7 +4166,7 @@
 def : Pat<(vnot (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
 def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;

-defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
+defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", bitreverse>;
 defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
 defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
 defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-vbitwise.ll b/llvm/test/CodeGen/AArch64/arm64-vbitwise.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vbitwise.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vbitwise.ll
@@ -4,7 +4,7 @@
 ;CHECK-LABEL: rbit_8b:
 ;CHECK: rbit.8b
   %tmp1 = load <8 x i8>, <8 x i8>* %A
-  %tmp3 = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %tmp1)
+  %tmp3 = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %tmp1)
   ret <8 x i8> %tmp3
 }

@@ -12,12 +12,12 @@
 ;CHECK-LABEL: rbit_16b:
 ;CHECK: rbit.16b
   %tmp1 = load <16 x i8>, <16 x i8>* %A
-  %tmp3 = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %tmp1)
+  %tmp3 = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %tmp1)
   ret <16 x i8> %tmp3
 }

-declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) nounwind readnone

 define <8 x i16> @sxtl8h(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: sxtl8h:
diff --git a/llvm/test/CodeGen/AArch64/bitreverse.ll b/llvm/test/CodeGen/AArch64/bitreverse.ll
--- a/llvm/test/CodeGen/AArch64/bitreverse.ll
+++ b/llvm/test/CodeGen/AArch64/bitreverse.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s

 ; These tests just check that the plumbing is in place for @llvm.bitreverse.
@@ -6,13 +7,16 @@

 define <2 x i16> @f(<2 x i16> %a) {
 ; CHECK-LABEL: f:
-; CHECK: fmov [[REG1:w[0-9]+]], s0
-; CHECK-DAG: rbit [[REG2:w[0-9]+]], [[REG1]]
-; CHECK-DAG: fmov s0, [[REG2]]
-; CHECK-DAG: mov [[REG3:w[0-9]+]], v0.s[1]
-; CHECK-DAG: rbit [[REG4:w[0-9]+]], [[REG3]]
-; CHECK-DAG: mov v0.s[1], [[REG4]]
-; CHECK-DAG: ushr v0.2s, v0.2s, #16
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: rbit w8, w8
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: rbit w8, w9
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: ushr v0.2s, v0.2s, #16
+; CHECK-NEXT: ret
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
   ret <2 x i16> %b
 }
@@ -21,41 +25,161 @@

 define i8 @g(i8 %a) {
 ; CHECK-LABEL: g:
-; CHECK: rbit [[REG:w[0-9]+]], w0
-; CHECK-NEXT: lsr w0, [[REG]], #24
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit w8, w0
+; CHECK-NEXT: lsr w0, w8, #24
+; CHECK-NEXT: ret
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %b
 }

+declare i16 @llvm.bitreverse.i16(i16) readnone
+
+define i16 @g_16(i16 %a) {
+; CHECK-LABEL: g_16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit w8, w0
+; CHECK-NEXT: lsr w0, w8, #16
+; CHECK-NEXT: ret
+  %b = call i16 @llvm.bitreverse.i16(i16 %a)
+  ret i16 %b
+}
+
+declare i32 @llvm.bitreverse.i32(i32) readnone
+
+define i32 @g_32(i32 %a) {
+; CHECK-LABEL: g_32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit w0, w0
+; CHECK-NEXT: ret
+  %b = call i32 @llvm.bitreverse.i32(i32 %a)
+  ret i32 %b
+}
+
+declare i64 @llvm.bitreverse.i64(i64) readnone
+
+define i64 @g_64(i64 %a) {
+; CHECK-LABEL: g_64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit x0, x0
+; CHECK-NEXT: ret
+  %b = call i64 @llvm.bitreverse.i64(i64 %a)
+  ret i64 %b
+}
+
 declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>) readnone

 define <8 x i8> @g_vec(<8 x i8> %a) {
-; CHECK-DAG: movi [[M1:v.*]], #15
-; CHECK-DAG: movi [[M2:v.*]], #240
-; CHECK: and [[A1:v.*]], v0.8b, [[M1]]
-; CHECK: and [[A2:v.*]], v0.8b, [[M2]]
-; CHECK-DAG: shl [[L4:v.*]], [[A1]], #4
-; CHECK-DAG: ushr [[R4:v.*]], [[A2]], #4
-; CHECK-DAG: orr [[V4:v.*]], [[R4]], [[L4]]
-
-; CHECK-DAG: movi [[M3:v.*]], #51
-; CHECK-DAG: movi [[M4:v.*]], #204
-; CHECK: and [[A3:v.*]], [[V4]], [[M3]]
-; CHECK: and [[A4:v.*]], [[V4]], [[M4]]
-; CHECK-DAG: shl [[L2:v.*]], [[A3]], #2
-; CHECK-DAG: ushr [[R2:v.*]], [[A4]], #2
-; CHECK-DAG: orr [[V2:v.*]], [[R2]], [[L2]]
-
-; CHECK-DAG: movi [[M5:v.*]], #85
-; CHECK-DAG: movi [[M6:v.*]], #170
-; CHECK: and [[A5:v.*]], [[V2]], [[M5]]
-; CHECK: and [[A6:v.*]], [[V2]], [[M6]]
-; CHECK-DAG: shl [[L1:v.*]], [[A5]], #1
-; CHECK-DAG: ushr [[R1:v.*]], [[A6]], #1
-; CHECK: orr [[V1:v.*]], [[R1]], [[L1]]
-
-; CHECK: ret
+; CHECK-LABEL: g_vec:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: ret
   %b = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
   ret <8 x i8> %b
 }
+
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
+
+define <16 x i8> @g_vec_16x8(<16 x i8> %a) {
+; CHECK-LABEL: g_vec_16x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+  ret <16 x i8> %b
+}
+
+declare <4 x i16> @llvm.bitreverse.v4i16(<4 x i16>) readnone
+
+define <4 x i16> @g_vec_4x16(<4 x i16> %a) {
+; CHECK-LABEL: g_vec_4x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: ret
+  %b = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %a)
+  ret <4 x i16> %b
+}
+
+declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
+
+define <8 x i16> @g_vec_8x16(<8 x i16> %a) {
+; CHECK-LABEL: g_vec_8x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+  ret <8 x i16> %b
+}
+
+declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) readnone
+
+define <2 x i32> @g_vec_2x32(<2 x i32> %a) {
+; CHECK-LABEL: g_vec_2x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: rbit w8, w8
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: rbit w8, w9
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+
+  %b = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a)
+  ret <2 x i32> %b
+}
+
+declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
+
+define <4 x i32> @g_vec_4x32(<4 x i32> %a) {
+; CHECK-LABEL: g_vec_4x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: rbit w10, w10
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w11, v0.s[3]
+; CHECK-NEXT: fmov s0, w10
+; CHECK-NEXT: rbit w8, w8
+; CHECK-NEXT: rbit w9, w9
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: mov v0.s[2], w9
+; CHECK-NEXT: rbit w8, w11
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ret
+  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+  ret <4 x i32> %b
+}
+
+declare <1 x i64> @llvm.bitreverse.v1i64(<1 x i64>) readnone
+
+define <1 x i64> @g_vec_1x64(<1 x i64> %a) {
+; CHECK-LABEL: g_vec_1x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: rbit x8, x8
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+  %b = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %a)
+  ret <1 x i64> %b
+}
+
+declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
+
+define <2 x i64> @g_vec_2x64(<2 x i64> %a) {
+; CHECK-LABEL: g_vec_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: rbit x8, x8
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: rbit x8, x9
+; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: ret
+  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+  ret <2 x i64> %b
+}
diff --git a/llvm/test/CodeGen/AArch64/neon_rbit.ll b/llvm/test/CodeGen/AArch64/neon_rbit.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon_rbit.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-eabi -mattr=+fullfp16 %s -o - | FileCheck %s
+
+; The llvm.aarch64.neon.rbit intrinsic should be auto-upgraded to the
+; target-independent bitreverse intrinsic.
+
+declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
+
+define <8 x i8> @rbit_8x8(<8 x i8> %A) nounwind {
+; CHECK-LABEL: rbit_8x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: ret
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %A)
+  ret <8 x i8> %tmp3
+}
+
+declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
+
+define <16 x i8> @rbit_16x8(<16 x i8> %A) nounwind {
+; CHECK-LABEL: rbit_16x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %A)
+  ret <16 x i8> %tmp3
+}
+
+declare <4 x i16> @llvm.aarch64.neon.rbit.v4i16(<4 x i16>) nounwind readnone
+
+define <4 x i16> @rbit_4x16(<4 x i16> %A) nounwind {
+; CHECK-LABEL: rbit_4x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: ret
+  %tmp3 = call <4 x i16> @llvm.aarch64.neon.rbit.v4i16(<4 x i16> %A)
+  ret <4 x i16> %tmp3
+}
+
+declare <8 x i16> @llvm.aarch64.neon.rbit.v8i16(<8 x i16>) nounwind readnone
+
+define <8 x i16> @rbit_8x16(<8 x i16> %A) {
+; CHECK-LABEL: rbit_8x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.rbit.v8i16(<8 x i16> %A)
+  ret <8 x i16> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.rbit.v2i32(<2 x i32>) nounwind readnone
+
+define <2 x i32> @rbit_2x32(<2 x i32> %A) {
+; CHECK-LABEL: rbit_2x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: rbit w8, w8
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: rbit w8, w9
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+  %tmp3 = call <2 x i32> @llvm.aarch64.neon.rbit.v2i32(<2 x i32> %A)
+  ret <2 x i32> %tmp3
+}
+
+declare <4 x i32> @llvm.aarch64.neon.rbit.v4i32(<4 x i32>) nounwind readnone
+
+define <4 x i32> @rbit_4x32(<4 x i32> %A) {
+; CHECK-LABEL: rbit_4x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: rbit w10, w10
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w11, v0.s[3]
+; CHECK-NEXT: fmov s0, w10
+; CHECK-NEXT: rbit w8, w8
+; CHECK-NEXT: rbit w9, w9
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: mov v0.s[2], w9
+; CHECK-NEXT: rbit w8, w11
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ret
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.rbit.v4i32(<4 x i32> %A)
+  ret <4 x i32> %tmp3
+}
+
+declare <1 x i64> @llvm.aarch64.neon.rbit.v1i64(<1 x i64>) readnone
+
+define <1 x i64> @rbit_1x64(<1 x i64> %A) {
+; CHECK-LABEL: rbit_1x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: rbit x8, x8
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+  %tmp3 = call <1 x i64> @llvm.aarch64.neon.rbit.v1i64(<1 x i64> %A)
+  ret <1 x i64> %tmp3
+}
+
+declare <2 x i64> @llvm.aarch64.neon.rbit.v2i64(<2 x i64>) readnone
+
+define <2 x i64> @rbit_2x64(<2 x i64> %A) {
+; CHECK-LABEL: rbit_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: rbit x8, x8
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: rbit x8, x9
+; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: ret
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.rbit.v2i64(<2 x i64> %A)
+  ret <2 x i64> %tmp3
+}
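
For reviewers, a minimal sketch (not part of the patch) of the compatibility path this change relies on: IR or bitcode written against the removed target-specific intrinsic is rewritten by the auto-upgrader to the generic @llvm.bitreverse intrinsic, which the new patterns then select back to RBIT. The function name @old_rbit below is purely illustrative.

declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>)

define <8 x i8> @old_rbit(<8 x i8> %v) {
  ; On module load, this call is expected to be upgraded in place to
  ;   %r = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %v)
  ; and llc -mtriple=aarch64-eabi should still emit a single rbit v0.8b, v0.8b.
  %r = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %v)
  ret <8 x i8> %r
}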