diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -570,10 +570,20 @@
   // AArch64 lacks both left-rotate and popcount instructions.
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
   setOperationAction(ISD::ROTL, MVT::i64, Expand);
+
+  // Vector rotates of both directions are lowered via ROTR: ROTL is
+  // expanded into ROTR, and ROTR is custom-lowered to SHL plus SRI.
   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
     setOperationAction(ISD::ROTL, VT, Expand);
-    setOperationAction(ISD::ROTR, VT, Expand);
   }
+  setOperationAction(ISD::ROTR, MVT::v8i8, Custom);
+  setOperationAction(ISD::ROTR, MVT::v16i8, Custom);
+  setOperationAction(ISD::ROTR, MVT::v4i16, Custom);
+  setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+  setOperationAction(ISD::ROTR, MVT::v4i32, Custom);
+  setOperationAction(ISD::ROTR, MVT::v1i64, Custom);
+  setOperationAction(ISD::ROTR, MVT::v2i64, Custom);
 
   // AArch64 doesn't have i32 MULH{S|U}.
   setOperationAction(ISD::MULHU, MVT::i32, Expand);
@@ -6106,6 +6116,42 @@
     return Result;
   }
+  case ISD::ROTR: {
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    assert(VT.isFixedLengthVector() && "Unexpected ROTR of scalable vector");
+
+    // Only constant-splat rotate amounts are handled here; everything else
+    // falls back to the generic funnel-shift expansion.
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+    APInt Splat;
+    if (!ISD::isConstantSplatVector(Op1.getNode(), Splat))
+      return SDValue();
+
+    APInt ShlAmt;
+    APInt SriAmt;
+    uint64_t LaneWidth = VT.getVectorElementType().getFixedSizeInBits();
+    if (Splat.isNegative()) {
+      // A negative amount is a left rotate by the absolute value.
+      ShlAmt = (-Splat).zextOrTrunc(32);
+      SriAmt = LaneWidth - ShlAmt;
+    } else {
+      ShlAmt = LaneWidth - Splat.zextOrTrunc(32);
+      SriAmt = Splat.zextOrTrunc(32);
+    }
+
+    // Rotates by zero or by at least the lane width cannot be encoded in
+    // the SHL/SRI immediates; let the generic expansion handle them.
+    if (ShlAmt.uge(LaneWidth) || SriAmt.uge(LaneWidth))
+      return SDValue();
+
+    SDValue Shl = DAG.getNode(AArch64ISD::VSHL, DL, VT, Op0,
+                              DAG.getConstant(ShlAmt, DL, MVT::i32));
+    SDValue Sri = DAG.getNode(AArch64ISD::VSRI, DL, VT, Shl, Op0,
+                              DAG.getConstant(SriAmt, DL, MVT::i32));
+    return Sri;
+  }
   }
 }
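
Note on the lowering above: it uses the identity rotr(x, C) = (x << (W - C)) | (x >> C) for lane width W and constant splat amount C, and SRI supplies the OR for free: sri #C shifts its source right by C and inserts the surviving low W - C bits into the destination, leaving the destination's top C bits (which already hold x << (W - C)) untouched. A minimal sketch of the intended mapping; %x and %r are hypothetical names, not taken from the patch's tests (W = 8, C = 6):

  ; rotate each i8 lane right by 6
  %r = tail call <8 x i8> @llvm.fshr.v8i8(<8 x i8> %x, <8 x i8> %x, <8 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
  ; expected lowering:
  ;   shl v1.8b, v0.8b, #2   ; v1 = x << 2, low two bits of each lane zero
  ;   sri v1.8b, v0.8b, #6   ; insert x >> 6 into those low two bits
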
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1291,10 +1291,6 @@
 def : Pat<(v2i64 (int_aarch64_crypto_xar (v2i64 V128:$Vn), (v2i64 V128:$Vm), (i64 timm0_63:$imm))),
           (XAR (v2i64 V128:$Vn), (v2i64 V128:$Vm), (timm0_63:$imm))>;
-
-def : Pat<(xor (v2i64 V128:$Vn), (or (AArch64vlshr (v2i64 V128:$Vm), (i32 63)), (AArch64vshl (v2i64 V128:$Vm), (i32 1)))),
-          (RAX1 (v2i64 V128:$Vn), (v2i64 V128:$Vm))>;
-
 } // HasSHA3
 
 let Predicates = [HasSM4] in {
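
The TableGen pattern removed above matched RAX1 (xor of one operand with the other rotated left by one) against the ushr/shl/orr sequence that vector rotates used to expand to. Once rotates lower to shl+sri, that or-of-shifts form no longer reaches instruction selection, so the pattern could never fire; only the int_aarch64_crypto_xar-based XAR pattern is kept. The IR shape it used to catch is the one in rax1.ll further down:

  %a = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %y, <2 x i64> %y, <2 x i64> <i64 1, i64 1>)
  %b = xor <2 x i64> %x, %a   ; previously selected to: rax1 v0.2d, v0.2d, v1.2d

which is presumably why rax1.ll is marked XFAIL in this revision: until an equivalent pattern is added for the new shl/sri (or add/sri) form, the +sha3 run no longer emits rax1 for this input.
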
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
--- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
 
 declare i8 @llvm.fshl.i8(i8, i8, i8)
@@ -83,7 +83,7 @@
 ; CHECK-NEXT:    neg v2.4s, v2.4s
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
   ret <4 x i32> %f
@@ -94,9 +94,9 @@
 define <4 x i32> @rotl_v4i32_rotl_const_shift(<4 x i32> %x) {
 ; CHECK-LABEL: rotl_v4i32_rotl_const_shift:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushr v1.4s, v0.4s, #29
-; CHECK-NEXT:    shl v0.4s, v0.4s, #3
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    shl v1.4s, v0.4s, #3
+; CHECK-NEXT:    sri v1.4s, v0.4s, #29
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
   ret <4 x i32> %f
@@ -185,8 +185,8 @@
 ; CHECK-LABEL: rotr_v4i32_const_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v1.4s, v0.4s, #29
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #3
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    sri v1.4s, v0.4s, #3
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
   ret <4 x i32> %f
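
The check updates above show the intended codegen effect: a constant-amount rotate shrinks from three instructions (ushr, shl, orr) to two (shl, sri), and the OR no longer needs its own temporary register. Since SRI both reads and writes its destination, an extra mov v0.16b, v1.16b appears when the result must end up back in the argument register, as in the two *_const_shift functions above; in the srem/urem tests below, where the rotate feeds a comparison, no such copy is needed and the sequence is strictly shorter.
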
diff --git a/llvm/test/CodeGen/AArch64/rax1.ll b/llvm/test/CodeGen/AArch64/rax1.ll
--- a/llvm/test/CodeGen/AArch64/rax1.ll
+++ b/llvm/test/CodeGen/AArch64/rax1.ll
@@ -1,20 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 2
 ; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
 ; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; XFAIL: *
 
 define <2 x i64> @rax1(<2 x i64> %x, <2 x i64> %y) {
+; NOSHA3-LABEL: rax1:
+; NOSHA3:       // %bb.0:
+; NOSHA3-NEXT:    add v2.2d, v1.2d, v1.2d
+; NOSHA3-NEXT:    sri v2.2d, v1.2d, #63
+; NOSHA3-NEXT:    eor v0.16b, v0.16b, v2.16b
+; NOSHA3-NEXT:    ret
+;
 ; SHA3-LABEL: rax1:
 ; SHA3:       // %bb.0:
 ; SHA3-NEXT:    rax1 v0.2d, v0.2d, v1.2d
 ; SHA3-NEXT:    ret
-;
-; NOSHA3-LABEL: rax1:
-; NOSHA3:       // %bb.0:
-; NOSHA3-NEXT:    ushr v2.2d, v1.2d, #63
-; NOSHA3-NEXT:    add v1.2d, v1.2d, v1.2d
-; NOSHA3-NEXT:    orr v1.16b, v1.16b, v2.16b
-; NOSHA3-NEXT:    eor v0.16b, v0.16b, v1.16b
-; NOSHA3-NEXT:    ret
   %a = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %y, <2 x i64> %y, <2 x i64> <i64 1, i64 1>)
   %b = xor <2 x i64> %x, %a
   ret <2 x i64> %b
diff --git a/llvm/test/CodeGen/AArch64/rotate.ll b/llvm/test/CodeGen/AArch64/rotate.ll
--- a/llvm/test/CodeGen/AArch64/rotate.ll
+++ b/llvm/test/CodeGen/AArch64/rotate.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc < %s -mtriple=aarch64--linux-gnueabihf | FileCheck %s
 
 ;; This used to cause a backend crash about not being able to
@@ -6,10 +6,9 @@
 define <2 x i64> @testcase(ptr %in) {
 ; CHECK-LABEL: testcase:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ushr v1.2d, v0.2d, #8
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    shl v0.2d, v1.2d, #56
+; CHECK-NEXT:    sri v0.2d, v1.2d, #8
 ; CHECK-NEXT:    ret
   %1 = load <2 x i64>, ptr %in
   %2 = lshr <2 x i64> %1, <i64 8, i64 8>
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 ; Odd+Even divisors
@@ -35,8 +35,8 @@
 define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_allones_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #52429
-; CHECK-NEXT:    mov w9, #39321
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
+; CHECK-NEXT:    mov w9, #39321 // =0x9999
 ; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    movk w9, #6553, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -56,8 +56,8 @@
 define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_allones_ne:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #52429
-; CHECK-NEXT:    mov w9, #39321
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
+; CHECK-NEXT:    mov w9, #39321 // =0x9999
 ; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    movk w9, #6553, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -79,21 +79,20 @@
 define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_allones_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #28087
-; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    mov w8, #28087 // =0x6db7
+; CHECK-NEXT:    mov w9, #9362 // =0x2492
 ; CHECK-NEXT:    movk w8, #46811, lsl #16
 ; CHECK-NEXT:    movk w9, #4681, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #31
-; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v0.4s, v2.4s, #1
+; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %srem,
@@ -103,21 +102,20 @@
 define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_allones_ne:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #28087
-; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    mov w8, #28087 // =0x6db7
+; CHECK-NEXT:    mov w9, #9362 // =0x2492
 ; CHECK-NEXT:    movk w8, #46811, lsl #16
 ; CHECK-NEXT:    movk w9, #4681, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #31
-; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v0.4s, v2.4s, #1
+; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X,
   %cmp = icmp ne <4 x i32> %srem,
@@ -271,8 +269,8 @@
 define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #52429
-; CHECK-NEXT:    mov w9, #39321
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
+; CHECK-NEXT:    mov w9, #39321 // =0x9999
 ; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    movk w9, #6553, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -294,21 +292,20 @@
 define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #28087
-; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    mov w8, #28087 // =0x6db7
+; CHECK-NEXT:    mov w9, #9362 // =0x2492
 ; CHECK-NEXT:    movk w8, #46811, lsl #16
 ; CHECK-NEXT:    movk w9, #4681, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI11_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI11_0]
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #31
-; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v0.4s, v2.4s, #1
+; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %srem,
@@ -525,8 +522,8 @@
 define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_allones_and_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #52429
-; CHECK-NEXT:    mov w9, #39321
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
+; CHECK-NEXT:    mov w9, #39321 // =0x9999
 ; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    movk w9, #6553, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -548,21 +545,20 @@
 define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_allones_and_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #28087
-; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    mov w8, #28087 // =0x6db7
+; CHECK-NEXT:    mov w9, #9362 // =0x2492
 ; CHECK-NEXT:    movk w8, #46811, lsl #16
 ; CHECK-NEXT:    movk w9, #4681, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI20_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI20_0]
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #31
-; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v0.4s, v2.4s, #1
+; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %srem,
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -1,17 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 ; Odd divisor
 define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_25:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23593
-; CHECK-NEXT:    mov w9, #47185
+; CHECK-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-NEXT:    mov w9, #47185 // =0xb851
 ; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    movk w9, #1310, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
-; CHECK-NEXT:    mov w8, #28834
+; CHECK-NEXT:    mov w8, #28834 // =0x70a2
 ; CHECK-NEXT:    movk w8, #2621, lsl #16
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
@@ -29,22 +29,21 @@
 define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23593
-; CHECK-NEXT:    mov w9, #47184
+; CHECK-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-NEXT:    mov w9, #47184 // =0xb850
 ; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    movk w9, #1310, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
-; CHECK-NEXT:    mov w8, #23592
+; CHECK-NEXT:    mov w8, #23592 // =0x5c28
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movk w8, #655, lsl #16
+; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #30
-; CHECK-NEXT:    ushr v1.4s, v2.4s, #2
-; CHECK-NEXT:    dup v2.4s, w8
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v0.4s, v2.4s, #2
+; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %srem,
@@ -58,13 +57,13 @@
 define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_neg25:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23593
-; CHECK-NEXT:    mov w9, #47185
+; CHECK-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-NEXT:    mov w9, #47185 // =0xb851
 ; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    movk w9, #1310, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
-; CHECK-NEXT:    mov w8, #28834
+; CHECK-NEXT:    mov w8, #28834 // =0x70a2
 ; CHECK-NEXT:    movk w8, #2621, lsl #16
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
@@ -82,22 +81,21 @@
 define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_neg100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23593
-; CHECK-NEXT:    mov w9, #47184
+; CHECK-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-NEXT:    mov w9, #47184 // =0xb850
 ; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    movk w9, #1310, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
-; CHECK-NEXT:    mov w8, #23592
+; CHECK-NEXT:    mov w8, #23592 // =0x5c28
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movk w8, #655, lsl #16
+; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #30
-; CHECK-NEXT:    ushr v1.4s, v2.4s, #2
-; CHECK-NEXT:    dup v2.4s, w8
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v0.4s, v2.4s, #2
+; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %srem,
@@ -112,7 +110,7 @@
 define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_undef1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #34079
+; CHECK-NEXT:    mov w8, #34079 // =0x851f
 ; CHECK-NEXT:    movk w8, #20971, lsl #16
 ; CHECK-NEXT:    movi v3.4s, #25
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -135,7 +133,7 @@
 define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_undef1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #34079
+; CHECK-NEXT:    mov w8, #34079 // =0x851f
 ; CHECK-NEXT:    movk w8, #20971, lsl #16
 ; CHECK-NEXT:    movi v3.4s, #100
 ; CHECK-NEXT:    dup v1.4s, w8
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 ; Odd+Even divisors
@@ -251,7 +251,7 @@
 define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #52429
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
 ; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -271,18 +271,17 @@
 define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    mov w8, #28087 // =0x6db7
 ; CHECK-NEXT:    movk w8, #46811, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    adrp x8, .LCPI11_0
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
 ; CHECK-NEXT:    shl v1.4s, v0.4s, #31
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v1.4s, v0.4s, #1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI11_0]
+; CHECK-NEXT:    cmhs v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %urem,
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
@@ -6,7 +6,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
 ; CHECK-NEXT:    movk w8, #43690, lsl #16
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -26,11 +26,11 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI1_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT:    mov w8, #52429
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
 ; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    mov w8, #13106
+; CHECK-NEXT:    mov w8, #13106 // =0x3332
 ; CHECK-NEXT:    movk w8, #13107, lsl #16
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -47,18 +47,17 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI2_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
 ; CHECK-NEXT:    movk w8, #43690, lsl #16
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    mov w8, #43690
+; CHECK-NEXT:    mov w8, #43690 // =0xaaaa
 ; CHECK-NEXT:    movk w8, #10922, lsl #16
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    dup v2.4s, w8
 ; CHECK-NEXT:    shl v1.4s, v0.4s, #31
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    sri v1.4s, v0.4s, #1
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    cmhs v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X,
@@ -70,18 +69,17 @@
 ; CHECK-LABEL: t32_6_part1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    mov w9, #43691
-; CHECK-NEXT:    movk w9, #43690, lsl #16
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    adrp x8, .LCPI3_1
-; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
+; CHECK-NEXT:    movk w8, #43690, lsl #16
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mul v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    adrp x8, .LCPI3_1
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    shl v1.4s, v0.4s, #31
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    sri v1.4s, v0.4s, #1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT:    cmhs v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X,
@@ -94,7 +92,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
 ; CHECK-NEXT:    movk w8, #43690, lsl #16
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -1,15 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 ; Odd divisor
 define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_25:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23593
+; CHECK-NEXT:    mov w8, #23593 // =0x5c29
 ; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    mov w8, #28835
+; CHECK-NEXT:    mov w8, #28835 // =0x70a3
 ; CHECK-NEXT:    movk w8, #2621, lsl #16
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
@@ -26,19 +26,18 @@
 define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23593
+; CHECK-NEXT:    mov w8, #23593 // =0x5c29
 ; CHECK-NEXT:    movk w8, #49807, lsl #16
-; CHECK-NEXT:    movi v3.4s, #1
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    mov w8, #23592
+; CHECK-NEXT:    mov w8, #23592 // =0x5c28
 ; CHECK-NEXT:    movk w8, #655, lsl #16
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    dup v2.4s, w8
 ; CHECK-NEXT:    shl v1.4s, v0.4s, #30
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #2
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v1.4s, v0.4s, #2
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    cmhs v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %urem,
@@ -72,16 +71,15 @@
 ; CHECK-LABEL: test_urem_even_neg100:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    movi v3.4s, #1
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    adrp x8, .LCPI3_1
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
 ; CHECK-NEXT:    shl v1.4s, v0.4s, #30
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #2
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    sri v1.4s, v0.4s, #2
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT:    cmhs v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X,
   %cmp = icmp eq <4 x i32> %urem,
@@ -96,7 +94,7 @@
 define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_undef1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #34079
+; CHECK-NEXT:    mov w8, #34079 // =0x851f
 ; CHECK-NEXT:    movk w8, #20971, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
@@ -118,7 +116,7 @@
 define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_undef1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #34079
+; CHECK-NEXT:    mov w8, #34079 // =0x851f
 ; CHECK-NEXT:    movk w8, #20971, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/vector-rotate.ll b/llvm/test/CodeGen/AArch64/vector-rotate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-rotate.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 2
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @rotl_v8i8(<8 x i8> %0, ptr %1) {
+; CHECK-LABEL: rotl_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.8b, v0.8b, #2
+; CHECK-NEXT:    sri v1.8b, v0.8b, #6
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <8 x i8> @llvm.fshl.v8i8(<8 x i8> %0, <8 x i8> %0, <8 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
+  store <8 x i8> %3, ptr %1, align 8
+  ret void
+}
+
+define void @rotr_v8i8(<8 x i8> %0, ptr %1) {
+; CHECK-LABEL: rotr_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.8b, v0.8b, #6
+; CHECK-NEXT:    sri v1.8b, v0.8b, #2
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <8 x i8> @llvm.fshl.v8i8(<8 x i8> %0, <8 x i8> %0, <8 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
+  store <8 x i8> %3, ptr %1, align 8
+  ret void
+}
+
+define void @rotl_v16i8(<16 x i8> %0, ptr %1) {
+; CHECK-LABEL: rotl_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.16b, v0.16b, #2
+; CHECK-NEXT:    sri v1.16b, v0.16b, #6
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %0, <16 x i8> %0, <16 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
+  store <16 x i8> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotr_v16i8(<16 x i8> %0, ptr %1) {
+; CHECK-LABEL: rotr_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.16b, v0.16b, #6
+; CHECK-NEXT:    sri v1.16b, v0.16b, #2
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %0, <16 x i8> %0, <16 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
+  store <16 x i8> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotl_v4i16(<4 x i16> %0, ptr %1) {
+; CHECK-LABEL: rotl_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.4h, v0.4h, #2
+; CHECK-NEXT:    sri v1.4h, v0.4h, #14
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %0, <4 x i16> %0, <4 x i16> <i16 2, i16 2, i16 2, i16 2>)
+  store <4 x i16> %3, ptr %1, align 8
+  ret void
+}
+
+define void @rotr_v4i16(<4 x i16> %0, ptr %1) {
+; CHECK-LABEL: rotr_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.4h, v0.4h, #14
+; CHECK-NEXT:    sri v1.4h, v0.4h, #2
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %0, <4 x i16> %0, <4 x i16> <i16 14, i16 14, i16 14, i16 14>)
+  store <4 x i16> %3, ptr %1, align 8
+  ret void
+}
+
+define void @rotl_v8i16(<8 x i16> %0, ptr %1) {
+; CHECK-LABEL: rotl_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.8h, v0.8h, #2
+; CHECK-NEXT:    sri v1.8h, v0.8h, #14
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %0, <8 x i16> <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>)
+  store <8 x i16> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotr_v8i16(<8 x i16> %0, ptr %1) {
+; CHECK-LABEL: rotr_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.8h, v0.8h, #14
+; CHECK-NEXT:    sri v1.8h, v0.8h, #2
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %0, <8 x i16> <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>)
+  store <8 x i16> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotl_v2i32(<2 x i32> %0, ptr %1) {
+; CHECK-LABEL: rotl_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2s, v0.2s, #2
+; CHECK-NEXT:    sri v1.2s, v0.2s, #30
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %0, <2 x i32> %0, <2 x i32> <i32 2, i32 2>)
+  store <2 x i32> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotr_v2i32(<2 x i32> %0, ptr %1) {
+; CHECK-LABEL: rotr_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2s, v0.2s, #30
+; CHECK-NEXT:    sri v1.2s, v0.2s, #2
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %0, <2 x i32> %0, <2 x i32> <i32 30, i32 30>)
+  store <2 x i32> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotl_v4i32(<4 x i32> %0, ptr %1) {
+; CHECK-LABEL: rotl_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.4s, v0.4s, #2
+; CHECK-NEXT:    sri v1.4s, v0.4s, #30
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
+  store <4 x i32> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotr_v4i32(<4 x i32> %0, ptr %1) {
+; CHECK-LABEL: rotr_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.4s, v0.4s, #30
+; CHECK-NEXT:    sri v1.4s, v0.4s, #2
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 30, i32 30, i32 30, i32 30>)
+  store <4 x i32> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotl_v1i64(<1 x i64> %0, ptr %1) {
+; CHECK-LABEL: rotl_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl d1, d0, #2
+; CHECK-NEXT:    sri d1, d0, #62
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %0, <1 x i64> %0, <1 x i64> <i64 2>)
+  store <1 x i64> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotr_v1i64(<1 x i64> %0, ptr %1) {
+; CHECK-LABEL: rotr_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl d1, d0, #62
+; CHECK-NEXT:    sri d1, d0, #2
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %0, <1 x i64> %0, <1 x i64> <i64 62>)
+  store <1 x i64> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotl_v2i64(<2 x i64> %0, ptr %1) {
+; CHECK-LABEL: rotl_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #2
+; CHECK-NEXT:    sri v1.2d, v0.2d, #62
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %0, <2 x i64> %0, <2 x i64> <i64 2, i64 2>)
+  store <2 x i64> %3, ptr %1, align 16
+  ret void
+}
+
+define void @rotr_v2i64(<2 x i64> %0, ptr %1) {
+; CHECK-LABEL: rotr_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #62
+; CHECK-NEXT:    sri v1.2d, v0.2d, #2
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %3 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %0, <2 x i64> %0, <2 x i64> <i64 62, i64 62>)
+  store <2 x i64> %3, ptr %1, align 16
+  ret void
+}
+
+declare <8 x i8> @llvm.fshl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) #0
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) #0
+declare <4 x i16> @llvm.fshl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #0
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) #0
+declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #0
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #0
+declare <1 x i64> @llvm.fshl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) #0
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
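
Note that the rotr_* functions above are phrased as llvm.fshl calls with an amount of W - 2 rather than as llvm.fshr calls, so both rotate directions are funneled through the same ROTL-expand-to-ROTR path. A direct fshr form should lower identically under this patch; a hypothetical example, not part of the test file:

  %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
  ; expected:
  ;   shl v1.4s, v0.4s, #30
  ;   sri v1.4s, v0.4s, #2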