diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1564,11 +1564,11 @@ // Add any operands of the new node which have not yet been combined to the // worklist as well. Because the worklist uniques things already, this // won't repeatedly process the same operand. - CombinedNodes.insert(N); for (const SDValue &ChildN : N->op_values()) if (!CombinedNodes.count(ChildN.getNode())) AddToWorklist(ChildN.getNode()); + CombinedNodes.insert(N); SDValue RV = combine(N); if (!RV.getNode()) @@ -1602,10 +1602,8 @@ // out), because re-visiting the EntryToken and its users will not uncover // any additional opportunities, but there may be a large number of such // users, potentially causing compile time explosion. - if (RV.getOpcode() != ISD::EntryToken) { - AddToWorklist(RV.getNode()); - AddUsersToWorklist(RV.getNode()); - } + if (RV.getOpcode() != ISD::EntryToken) + AddToWorklistWithUsers(RV.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1153,7 +1153,15 @@ ; CHECK-LABEL: testDUP.v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.8b, v0.b[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.b[1], v0.b[0] +; CHECK-NEXT: mov v1.b[2], v0.b[0] +; CHECK-NEXT: mov v1.b[3], v0.b[0] +; CHECK-NEXT: mov v1.b[4], v0.b[0] +; CHECK-NEXT: mov v1.b[5], v0.b[0] +; CHECK-NEXT: mov v1.b[6], v0.b[0] +; CHECK-NEXT: mov v1.b[7], v0.b[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret %b = extractelement <1 x i8> %a, i32 0 %c = insertelement <8 x i8> undef, i8 %b, i32 0 @@ -1171,7 +1179,15 @@ ; CHECK-LABEL: testDUP.v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.h[1], v0.h[0] +; CHECK-NEXT: mov v1.h[2], v0.h[0] +; CHECK-NEXT: mov v1.h[3], v0.h[0] +; CHECK-NEXT: mov v1.h[4], v0.h[0] +; CHECK-NEXT: mov v1.h[5], v0.h[0] +; CHECK-NEXT: mov v1.h[6], v0.h[0] +; CHECK-NEXT: mov v1.h[7], v0.h[0] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %b = extractelement <1 x i16> %a, i32 0 %c = insertelement <8 x i16> undef, i16 %b, i32 0 @@ -1189,7 +1205,11 @@ ; CHECK-LABEL: testDUP.v1i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: mov v1.s[2], v0.s[0] +; CHECK-NEXT: mov v1.s[3], v0.s[0] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %b = extractelement <1 x i32> %a, i32 0 %c = insertelement <4 x i32> undef, i32 %b, i32 0 @@ -1202,7 +1222,14 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK-LABEL: getl: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.b[2], v0.b[2] +; CHECK-NEXT: mov v1.b[3], v0.b[3] +; CHECK-NEXT: mov v1.b[4], v0.b[4] +; CHECK-NEXT: mov v1.b[5], v0.b[5] +; CHECK-NEXT: mov v1.b[6], v0.b[6] +; CHECK-NEXT: mov v1.b[7], v0.b[7] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <8 x i8> undef, 
i8 %vecext, i32 0 @@ -1318,7 +1345,11 @@ ; CHECK-LABEL: test_dup_v1i64_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1334,7 +1365,9 @@ ; CHECK-LABEL: test_dup_v1i64_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1396,7 +1429,11 @@ define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 0 @@ -1411,7 +1448,9 @@ define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 0 @@ -1480,7 +1519,8 @@ ; CHECK-LABEL: test_concat_same_v1i32_v1i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: mov v0.s[1], v0.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %0 = extractelement <2 x i32> %a, i32 0 @@ -1523,7 +1563,16 @@ ; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v2.b[1], v0.b[1] +; CHECK-NEXT: mov v2.b[2], v0.b[2] +; CHECK-NEXT: mov v2.b[3], v0.b[3] +; CHECK-NEXT: mov v2.b[4], v0.b[4] +; CHECK-NEXT: mov v2.b[5], v0.b[5] +; CHECK-NEXT: mov v2.b[6], v0.b[6] +; CHECK-NEXT: mov v2.b[7], v0.b[7] +; CHECK-NEXT: mov v2.d[1], v1.d[0] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1550,7 +1599,14 @@ ; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v0.b[8], v1.b[0] +; CHECK-NEXT: mov v0.b[9], v1.b[1] +; CHECK-NEXT: mov v0.b[10], v1.b[2] +; CHECK-NEXT: mov v0.b[11], v1.b[3] +; CHECK-NEXT: mov v0.b[12], v1.b[4] +; CHECK-NEXT: mov v0.b[13], v1.b[5] +; CHECK-NEXT: mov v0.b[14], v1.b[6] +; CHECK-NEXT: mov v0.b[15], v1.b[7] ; CHECK-NEXT: ret entry: %vecext = extractelement <16 x i8> %x, i32 0 @@ -1592,8 +1648,24 @@ ; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.b[1], v0.b[1] +; CHECK-NEXT: mov v2.b[2], v0.b[2] +; CHECK-NEXT: mov v2.b[3], v0.b[3] +; CHECK-NEXT: mov v2.b[4], v0.b[4] +; CHECK-NEXT: mov v2.b[5], v0.b[5] +; CHECK-NEXT: mov v2.b[6], v0.b[6] +; CHECK-NEXT: mov v2.b[7], 
v0.b[7] +; CHECK-NEXT: mov v2.b[8], v1.b[0] +; CHECK-NEXT: mov v2.b[9], v1.b[1] +; CHECK-NEXT: mov v2.b[10], v1.b[2] +; CHECK-NEXT: mov v2.b[11], v1.b[3] +; CHECK-NEXT: mov v2.b[12], v1.b[4] +; CHECK-NEXT: mov v2.b[13], v1.b[5] +; CHECK-NEXT: mov v2.b[14], v1.b[6] +; CHECK-NEXT: mov v2.b[15], v1.b[7] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1645,7 +1717,12 @@ ; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v2.h[1], v0.h[1] +; CHECK-NEXT: mov v2.h[2], v0.h[2] +; CHECK-NEXT: mov v2.h[3], v0.h[3] +; CHECK-NEXT: mov v2.d[1], v1.d[0] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -1664,7 +1741,10 @@ ; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v0.h[4], v1.h[0] +; CHECK-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NEXT: mov v0.h[6], v1.h[2] +; CHECK-NEXT: mov v0.h[7], v1.h[3] ; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i16> %x, i32 0 @@ -1690,8 +1770,16 @@ ; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.h[1], v0.h[1] +; CHECK-NEXT: mov v2.h[2], v0.h[2] +; CHECK-NEXT: mov v2.h[3], v0.h[3] +; CHECK-NEXT: mov v2.h[4], v1.h[0] +; CHECK-NEXT: mov v2.h[5], v1.h[1] +; CHECK-NEXT: mov v2.h[6], v1.h[2] +; CHECK-NEXT: mov v2.h[7], v1.h[3] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -1727,6 +1815,7 @@ ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], v0.s[1] ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: @@ -1742,7 +1831,8 @@ ; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: mov v0.s[3], v1.s[1] ; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i32> %x, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -2546,7 +2546,15 @@ ; CHECK-LABEL: vmulq_built_dup_fromsmall_test: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mul.8h v0, v0, v1[0] +; CHECK-NEXT: mov.16b v2, v1 +; CHECK-NEXT: mov.h v2[1], v1[0] +; CHECK-NEXT: mov.h v2[2], v1[0] +; CHECK-NEXT: mov.h v2[3], v1[0] +; CHECK-NEXT: mov.h v2[4], v1[0] +; CHECK-NEXT: mov.h v2[5], v1[0] +; CHECK-NEXT: mov.h v2[6], v1[0] +; CHECK-NEXT: mov.h v2[7], v1[0] +; CHECK-NEXT: mul.8h v0, v0, v2 ; CHECK-NEXT: ret %vget_lane = extractelement <4 x i16> %b, i32 0 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll --- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll @@ -559,19 +559,17 @@ define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x 
i8> %d) { ; CHECK-LABEL: insert4_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: adrp x9, .LCPI14_1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v4.16b, v3.16b +; CHECK-NEXT: dup v4.8b, v0.b[4] ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.d[1], v2.d[0] -; CHECK-NEXT: mov v3.16b, v1.16b -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1] -; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b -; CHECK-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v2.16b -; CHECK-NEXT: trn1 v0.4h, v1.4h, v0.4h -; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: mov v4.b[1], v2.b[0] +; CHECK-NEXT: mov v4.b[2], v1.b[15] +; CHECK-NEXT: mov v4.b[3], v3.b[11] +; CHECK-NEXT: mov v4.b[4], v2.b[6] +; CHECK-NEXT: mov v4.b[5], v0.b[3] +; CHECK-NEXT: mov v4.b[6], v3.b[8] +; CHECK-NEXT: mov v4.b[7], v1.b[12] +; CHECK-NEXT: fmov d0, d4 ; CHECK-NEXT: ret %e1 = extractelement <8 x i8> %a, i32 4 %e2 = extractelement <8 x i8> %c, i32 0 @@ -629,17 +627,25 @@ define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) { ; CHECK-LABEL: insert4_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q31_q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v4.8b, v0.b[4] ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v4.16b, v3.16b -; CHECK-NEXT: mov v3.16b, v1.16b -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: adrp x8, .LCPI15_1 -; CHECK-NEXT: mov v0.d[1], v2.d[0] -; CHECK-NEXT: tbl v31.16b, { v3.16b, v4.16b }, v5.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] -; CHECK-NEXT: tbl v0.16b, { v31.16b, v0.16b }, v1.16b +; CHECK-NEXT: mov v4.b[1], v2.b[0] +; CHECK-NEXT: mov v4.b[2], v1.b[15] +; CHECK-NEXT: mov v4.b[3], v3.b[11] +; CHECK-NEXT: mov v4.b[4], v2.b[6] +; CHECK-NEXT: mov v4.b[5], v0.b[3] +; CHECK-NEXT: mov v4.b[6], v3.b[8] +; CHECK-NEXT: mov v4.b[7], v1.b[12] +; CHECK-NEXT: mov v4.b[8], v0.b[4] +; CHECK-NEXT: mov v4.b[9], v2.b[0] +; CHECK-NEXT: mov v4.b[10], v1.b[15] +; CHECK-NEXT: mov v4.b[11], v3.b[11] +; CHECK-NEXT: mov v4.b[12], v2.b[6] +; CHECK-NEXT: mov v4.b[13], v0.b[3] +; CHECK-NEXT: mov v4.b[14], v3.b[8] +; CHECK-NEXT: mov v4.b[15], v1.b[12] +; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %e1 = extractelement <8 x i8> %a, i32 4 %e2 = extractelement <8 x i8> %c, i32 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s target triple = "aarch64" @@ -10,10 +11,19 @@ define <8 x i32> @fixed_bitselect_v8i32(<8 x i32>* %pre_cond_ptr, <8 x i32>* %left_ptr, <8 x i32>* %right_ptr) #0 { ; CHECK-LABEL: fixed_bitselect_v8i32: -; CHECK-NOT: bsl {{.*}}, {{.*}}, {{.*}} -; CHECK-NOT: bit {{.*}}, {{.*}}, {{.*}} -; CHECK-NOT: bif {{.*}}, {{.*}}, {{.*}} -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z3.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2] +; CHECK-NEXT: add z3.s, z0.s, z3.s +; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: and z1.d, z3.d, z2.d +; 
CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: ret %pre_cond = load <8 x i32>, <8 x i32>* %pre_cond_ptr %left = load <8 x i32>, <8 x i32>* %left_ptr %right = load <8 x i32>, <8 x i32>* %right_ptr diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK @@ -21,198 +22,994 @@ ; NO_SVE-NOT: ptrue define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v4i16i32 - ; CHECK: ldr d[[D0:[0-9]+]], [x0] - ; CHECK-NEXT: ushll v[[D0]].4s, v[[D0]].4h, #0 - ; CHECK-NEXT: ret +; NO_SVE-LABEL: load_zext_v4i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ret +; +; CHECK-LABEL: load_zext_v4i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %ap %val = zext <4 x i16> %a to <4 x i32> ret <4 x i32> %val } define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v8i16i32 - ; CHECK: ptrue [[P0:p[0-9]+]].s, vl8 - ; CHECK-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; CHECK-NEXT: ret +; NO_SVE-LABEL: load_zext_v8i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ret +; +; CHECK-LABEL: load_zext_v8i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %ap %val = zext <8 x i16> %a to <8 x i32> ret <8 x i32> %val } define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v16i16i32 - ; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16 - ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: load_zext_v16i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_zext_v16i16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: uunpklo z1.s, z0.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_512-LABEL: load_zext_v16i16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: ret +; +; 
VBITS_GE_1024-LABEL: load_zext_v16i16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v16i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl16 +; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret ; Ensure sensible type legalistaion - ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 - ; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0] - ; VBITS_EQ_256-DAG: mov x9, #8 - ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 - ; VBITS_EQ_256-DAG: uunpklo [[R0:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: ext [[Z0]].b, [[Z0]].b, [[Z0]].b, #16 - ; VBITS_EQ_256-DAG: uunpklo [[R1:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: st1w { [[R1]].s }, [[PG1]], [x8, x9, lsl #2] - ; VBITS_EQ_256-DAG: st1w { [[R0]].s }, [[PG1]], [x8] - ; VBITS_EQ_256-DAG: ret %a = load <16 x i16>, <16 x i16>* %ap %val = zext <16 x i16> %a to <16 x i32> ret <16 x i32> %val } define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i16i32 - ; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32 - ; VBITS_GE_1024-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: load_zext_v32i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ldp q4, q6, [x0, #32] +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: ushll2 v5.4s, v4.8h, #0 +; NO_SVE-NEXT: ushll v4.4s, v4.4h, #0 +; NO_SVE-NEXT: ushll2 v7.4s, v6.8h, #0 +; NO_SVE-NEXT: ushll v6.4s, v6.4h, #0 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_zext_v32i16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z0.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z1.h +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: load_zext_v32i16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v32i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 +; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = zext <32 x i16> %a to <32 x i32> ret <32 x i32> %val } define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v64i16i32 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64 - ; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; 
VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_zext_v64i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ushll2 v17.4s, v1.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ushll2 v7.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q16, q6, [x0] +; NO_SVE-NEXT: stp q1, q17, [x8, #192] +; NO_SVE-NEXT: stp q0, q7, [x8, #224] +; NO_SVE-NEXT: ushll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.4s, v3.4h, #0 +; NO_SVE-NEXT: ushll2 v2.4s, v4.8h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #128] +; NO_SVE-NEXT: ushll v1.4s, v4.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #96] +; NO_SVE-NEXT: ushll v2.4s, v5.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v6.8h, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: ushll v0.4s, v6.4h, #0 +; NO_SVE-NEXT: ushll2 v2.4s, v16.8h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #32] +; NO_SVE-NEXT: ushll v1.4s, v16.4h, #0 +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_zext_v64i16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: mov x11, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: uunpklo z4.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z5.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z6.s, z2.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2] +; VBITS_EQ_256-NEXT: mov x10, #56 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z7.s, z3.h +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v64i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, <64 x i16>* %ap %val = zext <64 x i16> %a to <64 x i32> ret <64 x i32> %val } define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v4i16i32 - ; CHECK: ldr d[[D0:[0-9]+]], [x0] - ; CHECK-NEXT: sshll v[[D0]].4s, v[[D0]].4h, #0 - ; CHECK-NEXT: ret +; NO_SVE-LABEL: load_sext_v4i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: 
ret +; +; CHECK-LABEL: load_sext_v4i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %ap %val = sext <4 x i16> %a to <4 x i32> ret <4 x i32> %val } define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v8i16i32 - ; CHECK: ptrue [[P0:p[0-9]+]].s, vl8 - ; CHECK-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; CHECK-NEXT: ret +; NO_SVE-LABEL: load_sext_v8i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ret +; +; CHECK-LABEL: load_sext_v8i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %ap %val = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %val } define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v16i16i32 - ; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16 - ; VBITS_GE_512-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: load_sext_v16i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_sext_v16i16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z0.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_512-LABEL: load_sext_v16i16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: ret +; +; VBITS_GE_1024-LABEL: load_sext_v16i16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 +; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v16i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl16 +; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret ; Ensure sensible type legalistaion - ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 - ; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0] - ; VBITS_EQ_256-DAG: mov x9, #8 - ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 - ; VBITS_EQ_256-DAG: sunpklo [[R0:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: ext [[Z0]].b, [[Z0]].b, [[Z0]].b, #16 - ; VBITS_EQ_256-DAG: sunpklo [[R1:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: st1w { [[R1]].s }, [[PG1]], [x8, x9, lsl #2] - ; VBITS_EQ_256-DAG: st1w { [[R0]].s }, [[PG1]], [x8] - ; VBITS_EQ_256-DAG: ret %a = load <16 x i16>, <16 x i16>* %ap %val = sext <16 x i16> %a to <16 x i32> ret <16 x i32> %val } define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i16i32 - ; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32 - ; 
VBITS_GE_1024-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: load_sext_v32i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ldp q4, q6, [x0, #32] +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: sshll2 v5.4s, v4.8h, #0 +; NO_SVE-NEXT: sshll v4.4s, v4.4h, #0 +; NO_SVE-NEXT: sshll2 v7.4s, v6.8h, #0 +; NO_SVE-NEXT: sshll v6.4s, v6.4h, #0 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_sext_v32i16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z0.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z1.h +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: load_sext_v32i16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v32i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 +; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = sext <32 x i16> %a to <32 x i32> ret <32 x i32> %val } define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v64i16i32 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64 - ; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_sext_v64i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: sshll2 v17.4s, v1.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: sshll2 v7.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q16, q6, [x0] +; NO_SVE-NEXT: stp q1, q17, [x8, #192] +; NO_SVE-NEXT: stp q0, q7, [x8, #224] +; NO_SVE-NEXT: sshll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.4s, v3.4h, #0 +; NO_SVE-NEXT: sshll2 v2.4s, v4.8h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #128] +; NO_SVE-NEXT: sshll v1.4s, v4.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #96] +; NO_SVE-NEXT: sshll v2.4s, v5.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v6.8h, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: sshll v0.4s, v6.4h, #0 +; NO_SVE-NEXT: sshll2 v2.4s, v16.8h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #32] +; NO_SVE-NEXT: sshll v1.4s, v16.4h, #0 +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: 
load_sext_v64i16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: mov x11, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z5.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z6.s, z2.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2] +; VBITS_EQ_256-NEXT: mov x10, #56 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z7.s, z3.h +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v64i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, <64 x i16>* %ap %val = sext <64 x i16> %a to <64 x i32> ret <64 x i32> %val } define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i8i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1b { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_zext_v32i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ushll v2.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v3.8h, v1.8b, #0 +; NO_SVE-NEXT: ushll2 v1.8h, v1.16b, #0 +; NO_SVE-NEXT: ushll2 v5.4s, v3.8h, #0 +; NO_SVE-NEXT: ushll v6.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: ushll2 v7.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll2 v16.2d, v1.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q1, q16, [x8, #224] +; NO_SVE-NEXT: ushll2 v1.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: ushll2 v4.4s, v2.8h, #0 +; NO_SVE-NEXT: stp q6, q1, [x8, #192] +; NO_SVE-NEXT: ushll2 v1.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll2 v6.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q5, q1, [x8, #160] +; NO_SVE-NEXT: ushll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: ushll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: ushll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; 
NO_SVE-NEXT: ushll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_zext_v32i8i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ushll2 v2.8h, v0.16b, #0 +; VBITS_EQ_256-NEXT: ushll v1.8h, v0.8b, #0 +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: ushll2 v4.8h, v0.16b, #0 +; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z4.h +; VBITS_EQ_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: uunpklo z2.d, z3.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z3.s, z4.h +; VBITS_EQ_256-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: uunpklo z0.d, z2.s +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: uunpklo z0.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z1.s, z3.h +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v32i8i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, <32 x i8>* %ap %val = zext <32 x i8> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i8i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1sb { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_sext_v32i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: sshll v2.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v3.8h, v1.8b, #0 +; NO_SVE-NEXT: sshll2 v1.8h, v1.16b, #0 +; NO_SVE-NEXT: sshll2 v5.4s, v3.8h, #0 +; NO_SVE-NEXT: sshll v6.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: sshll2 v7.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll2 v16.2d, v1.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q1, q16, [x8, #224] +; NO_SVE-NEXT: sshll2 v1.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll2 v4.4s, v2.8h, #0 +; 
NO_SVE-NEXT: stp q6, q1, [x8, #192] +; NO_SVE-NEXT: sshll2 v1.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll2 v6.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q5, q1, [x8, #160] +; NO_SVE-NEXT: sshll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: sshll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: sshll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_sext_v32i8i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: sshll2 v2.8h, v0.16b, #0 +; VBITS_EQ_256-NEXT: sshll v1.8h, v0.8b, #0 +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: sshll2 v4.8h, v0.16b, #0 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z4.h +; VBITS_EQ_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z3.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z3.s, z4.h +; VBITS_EQ_256-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z0.d, z2.s +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z0.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z1.s, z3.h +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v32i8i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, <32 x i8>* %ap %val = sext <32 x i8> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i16i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_zext_v32i16i64: +; NO_SVE: // %bb.0: +; 
NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: ushll v4.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: ushll2 v7.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll2 v17.2d, v1.4s, #0 +; NO_SVE-NEXT: ushll2 v16.2d, v7.4s, #0 +; NO_SVE-NEXT: ushll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll2 v5.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v6.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q7, q16, [x8, #224] +; NO_SVE-NEXT: ushll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll2 v7.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: stp q1, q17, [x8, #160] +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q6, q7, [x8, #96] +; NO_SVE-NEXT: ushll2 v1.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll2 v6.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q5, q1, [x8, #32] +; NO_SVE-NEXT: ushll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #192] +; NO_SVE-NEXT: ushll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #128] +; NO_SVE-NEXT: ushll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_zext_v32i16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z3.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: uunpklo z4.s, z1.h +; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: uunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: uunpklo z0.s, z3.h +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z6.h +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: uunpklo z1.d, z4.s +; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v32i16i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x 
i16>* %ap %val = zext <32 x i16> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i16i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_sext_v32i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: sshll v4.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: sshll2 v7.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll2 v17.2d, v1.4s, #0 +; NO_SVE-NEXT: sshll2 v16.2d, v7.4s, #0 +; NO_SVE-NEXT: sshll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll2 v5.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v6.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q7, q16, [x8, #224] +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll2 v7.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: stp q1, q17, [x8, #160] +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q6, q7, [x8, #96] +; NO_SVE-NEXT: sshll2 v1.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll2 v6.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q5, q1, [x8, #32] +; NO_SVE-NEXT: sshll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #192] +; NO_SVE-NEXT: sshll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #128] +; NO_SVE-NEXT: sshll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_sext_v32i16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z3.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z1.h +; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z0.s, z3.h +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z6.h +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z4.s +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: st1d 
{ z0.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v32i16i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = sext <32 x i16> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i32i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_zext_v32i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ushll2 v17.2d, v1.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ushll2 v7.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q16, q6, [x0] +; NO_SVE-NEXT: stp q1, q17, [x8, #192] +; NO_SVE-NEXT: stp q0, q7, [x8, #224] +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v3.4s, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll2 v2.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #128] +; NO_SVE-NEXT: ushll v1.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #96] +; NO_SVE-NEXT: ushll v2.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v6.4s, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: ushll v0.2d, v6.2s, #0 +; NO_SVE-NEXT: ushll2 v2.2d, v16.4s, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #32] +; NO_SVE-NEXT: ushll v1.2d, v16.2s, #0 +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_zext_v32i32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x12, #12 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: uunpklo z4.d, z0.s +; VBITS_EQ_256-NEXT: uunpklo z5.d, z1.s +; VBITS_EQ_256-NEXT: uunpklo z6.d, z2.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z7.d, z3.s +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, 
p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v32i32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, <32 x i32>* %ap %val = zext <32 x i32> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i32i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1sw { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: load_sext_v32i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: sshll2 v17.2d, v1.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: sshll2 v7.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q16, q6, [x0] +; NO_SVE-NEXT: stp q1, q17, [x8, #192] +; NO_SVE-NEXT: stp q0, q7, [x8, #224] +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v3.4s, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.2d, v3.2s, #0 +; NO_SVE-NEXT: sshll2 v2.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #128] +; NO_SVE-NEXT: sshll v1.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #96] +; NO_SVE-NEXT: sshll v2.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v6.4s, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: sshll v0.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll2 v2.2d, v16.4s, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #32] +; NO_SVE-NEXT: sshll v1.2d, v16.2s, #0 +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: load_sext_v32i32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x12, #12 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: sunpklo z4.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z5.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z6.d, z2.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z7.d, z3.s +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v32i32i64: +; VBITS_GE_2048: // %bb.0: +; 
VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, <32 x i32>* %ap %val = sext <32 x i32> %a to <32 x i64> ret <32 x i64> %val diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK @@ -24,6 +25,11 @@ ; Don't use SVE for 64-bit vectors. define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v8i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b @@ -34,6 +40,12 @@ ; Don't use SVE for 128-bit vectors. define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -44,6 +56,12 @@ } define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 @@ -58,6 +76,12 @@ } define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v64i8: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov w8, #32 @@ -81,6 +105,26 @@ } define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v128i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v128i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #64 +; VBITS_EQ_256-NEXT: mov w9, #96 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: mov w8, #32 +; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -96,6 +140,38 @@ } define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v256i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; 
NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v256i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #128 +; VBITS_EQ_256-NEXT: mov w9, #160 +; VBITS_EQ_256-NEXT: mov w10, #224 +; VBITS_EQ_256-NEXT: mov w11, #192 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_EQ_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_EQ_256-NEXT: mov w8, #64 +; VBITS_EQ_256-NEXT: mov w9, #96 +; VBITS_EQ_256-NEXT: mov w10, #32 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_EQ_256-NEXT: st1b { z3.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x1, x9] +; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x1, x10] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -114,6 +190,16 @@ ; Don't use SVE for 64-bit vectors. define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: umov w8, v0.h[2] +; NO_SVE-NEXT: umov w9, v0.h[3] +; NO_SVE-NEXT: fmov s0, w8 +; NO_SVE-NEXT: mov v0.s[1], w9 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -129,6 +215,12 @@ ; Don't use SVE for 128-bit vectors. define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -139,6 +231,12 @@ } define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -153,6 +251,12 @@ } define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v32i16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -176,6 +280,26 @@ } define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; 
VBITS_GE_1024-LABEL: extract_subvector_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -191,6 +315,38 @@ } define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v128i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v128i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #64 +; VBITS_EQ_256-NEXT: mov x9, #80 +; VBITS_EQ_256-NEXT: mov x10, #112 +; VBITS_EQ_256-NEXT: mov x11, #96 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -209,6 +365,12 @@ ; Don't use SVE for 64-bit vectors. define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2s, v0.s[1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -220,6 +382,12 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -230,6 +398,12 @@ } define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -244,6 +418,12 @@ } define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v16i32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -267,6 +447,26 @@ } define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -282,6 +482,38 @@ } define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #40 +; VBITS_EQ_256-NEXT: mov x10, #56 +; VBITS_EQ_256-NEXT: mov x11, #48 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -300,6 +532,12 @@ ; Don't use SVE for 128-bit vectors. 
define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -310,6 +548,12 @@ } define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v4i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -324,6 +568,12 @@ } define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v8i64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -347,6 +597,26 @@ } define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -362,6 +632,38 @@ } define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -380,6 +682,12 @@ ; Don't use SVE for 64-bit vectors. 
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2s, v0.s[1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -391,6 +699,12 @@ ; Don't use SVE for 128-bit vectors. define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -401,6 +715,12 @@ } define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -415,6 +735,12 @@ } define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v32f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -438,6 +764,26 @@ } define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -453,6 +799,38 @@ } define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #64 +; VBITS_EQ_256-NEXT: mov x9, #80 +; VBITS_EQ_256-NEXT: mov x10, #112 +; VBITS_EQ_256-NEXT: mov x11, #96 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: st1h { 
z3.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -471,6 +849,12 @@ ; Don't use SVE for 64-bit vectors. define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2s, v0.s[1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -482,6 +866,12 @@ ; Don't use SVE for 128-bit vectors. define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -492,6 +882,12 @@ } define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -506,6 +902,12 @@ } define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -529,6 +931,26 @@ } define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -544,6 +966,38 @@ } define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #40 +; VBITS_EQ_256-NEXT: mov x10, #56 +; VBITS_EQ_256-NEXT: mov x11, #48 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; 
VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -562,6 +1016,12 @@ ; Don't use SVE for 128-bit vectors. define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -572,6 +1032,12 @@ } define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -586,6 +1052,12 @@ } define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -609,6 +1081,26 @@ } define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -624,6 +1116,38 @@ } define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl 
#3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK @@ -26,6 +27,11 @@ ; Don't use SVE for 64-bit vectors. define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.4h, v0.4h, v1.4h +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.4h, v0.4h, v1.4h @@ -37,6 +43,11 @@ ; Don't use SVE for 128-bit vectors. define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.8h, v0.8h, v1.8h @@ -47,6 +58,15 @@ } define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -66,6 +86,20 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcmp_oeq_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, v5.8h +; NO_SVE-NEXT: ldp q6, q1, [x1] +; NO_SVE-NEXT: stp q0, q2, [x2, #32] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, v6.8h +; NO_SVE-NEXT: fcmeq v1.8h, v4.8h, v1.8h +; NO_SVE-NEXT: stp q3, q1, [x2] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcmp_oeq_v32f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -100,6 +134,58 @@ } define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q17, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #96] +; NO_SVE-NEXT: ldp q5, q6, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v5.8h, v0.8h +; NO_SVE-NEXT: ldp q7, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, v7.8h +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, v16.8h +; NO_SVE-NEXT: ldp q5, q7, 
[x1, #64] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, v5.8h +; NO_SVE-NEXT: ldp q16, q5, [x0] +; NO_SVE-NEXT: fcmeq v4.8h, v4.8h, v7.8h +; NO_SVE-NEXT: ldp q7, q18, [x1] +; NO_SVE-NEXT: stp q3, q4, [x2, #64] +; NO_SVE-NEXT: stp q1, q2, [x2, #96] +; NO_SVE-NEXT: fcmeq v1.8h, v6.8h, v17.8h +; NO_SVE-NEXT: fcmeq v2.8h, v16.8h, v7.8h +; NO_SVE-NEXT: fcmeq v3.8h, v5.8h, v18.8h +; NO_SVE-NEXT: stp q0, q1, [x2, #32] +; NO_SVE-NEXT: stp q2, q3, [x2] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h +; VBITS_EQ_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h +; VBITS_EQ_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -118,6 +204,110 @@ } define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_def_cfa_offset 32 +; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset b8, -8 +; NO_SVE-NEXT: .cfi_offset b9, -16 +; NO_SVE-NEXT: .cfi_offset b10, -32 +; NO_SVE-NEXT: ldp q25, q0, [x1, #224] +; NO_SVE-NEXT: ldp q2, q1, [x0, #224] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, v25.8h +; NO_SVE-NEXT: ldp q6, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q29, q28, [x1, #160] +; NO_SVE-NEXT: ldp q4, q3, [x0, #192] +; NO_SVE-NEXT: fcmeq v5.8h, v5.8h, v28.8h +; NO_SVE-NEXT: ldp q27, q26, [x1, #192] +; NO_SVE-NEXT: fcmeq v4.8h, v4.8h, v27.8h +; NO_SVE-NEXT: ldp q16, q7, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, v26.8h +; NO_SVE-NEXT: ldp q18, q17, [x0, #96] +; NO_SVE-NEXT: ldp q20, q19, [x0, #64] +; NO_SVE-NEXT: ldp q31, q30, [x1, #128] +; NO_SVE-NEXT: ldp q9, q8, [x1, #96] +; NO_SVE-NEXT: ldp q1, q28, [x1, #64] +; NO_SVE-NEXT: fcmeq v1.8h, v20.8h, v1.8h +; NO_SVE-NEXT: ldp q22, q21, [x0, #32] +; NO_SVE-NEXT: ldp q24, q23, [x0] +; NO_SVE-NEXT: ldp q26, q25, [x1, #32] +; NO_SVE-NEXT: ldp q10, q27, [x1] +; NO_SVE-NEXT: stp q4, q3, [x2, #192] +; NO_SVE-NEXT: stp q2, q0, [x2, #224] +; NO_SVE-NEXT: fcmeq v0.8h, v6.8h, v29.8h +; NO_SVE-NEXT: fcmeq v2.8h, v7.8h, v30.8h +; NO_SVE-NEXT: fcmeq v3.8h, v16.8h, v31.8h +; NO_SVE-NEXT: fcmeq v4.8h, v17.8h, v8.8h +; NO_SVE-NEXT: stp q0, q5, [x2, #160] +; NO_SVE-NEXT: fcmeq v5.8h, v18.8h, v9.8h +; NO_SVE-NEXT: fcmeq v0.8h, v19.8h, v28.8h +; NO_SVE-NEXT: stp q3, q2, [x2, #128] +; NO_SVE-NEXT: fcmeq v2.8h, v21.8h, v25.8h +; NO_SVE-NEXT: fcmeq v3.8h, v22.8h, v26.8h +; NO_SVE-NEXT: stp q5, q4, [x2, #96] +; NO_SVE-NEXT: fcmeq v4.8h, v23.8h, v27.8h +; NO_SVE-NEXT: stp q1, q0, [x2, #64] +; NO_SVE-NEXT: fcmeq v0.8h, v24.8h, v10.8h +; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; NO_SVE-NEXT: stp q3, q2, [x2, #32] +; NO_SVE-NEXT: stp q0, q4, [x2] +; NO_SVE-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #96 +; VBITS_EQ_256-NEXT: mov x9, #112 +; VBITS_EQ_256-NEXT: mov x10, #64 +; VBITS_EQ_256-NEXT: mov x11, #80 +; VBITS_EQ_256-NEXT: mov x12, #32 +; VBITS_EQ_256-NEXT: mov x13, #48 +; VBITS_EQ_256-NEXT: mov x14, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h +; VBITS_EQ_256-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff +; 
VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h +; VBITS_EQ_256-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z4.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h +; VBITS_EQ_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h +; VBITS_EQ_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -137,6 +327,11 @@ ; Don't use SVE for 64-bit vectors. define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.2s, v0.2s, v1.2s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s @@ -148,6 +343,11 @@ ; Don't use SVE for 128-bit vectors. define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s @@ -158,6 +358,15 @@ } define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: fcmeq v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -177,6 +386,20 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcmp_oeq_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v5.4s +; NO_SVE-NEXT: ldp q6, q1, [x1] +; NO_SVE-NEXT: stp q0, q2, [x2, #32] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v6.4s +; NO_SVE-NEXT: fcmeq v1.4s, v4.4s, v1.4s +; NO_SVE-NEXT: stp q3, q1, [x2] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcmp_oeq_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -211,6 +434,58 @@ } define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q17, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #96] +; NO_SVE-NEXT: ldp q5, q6, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.4s, v5.4s, v0.4s +; NO_SVE-NEXT: ldp q7, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v7.4s +; NO_SVE-NEXT: ldp q3, q4, [x0, 
#64] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v16.4s +; NO_SVE-NEXT: ldp q5, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v5.4s +; NO_SVE-NEXT: ldp q16, q5, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v7.4s +; NO_SVE-NEXT: ldp q7, q18, [x1] +; NO_SVE-NEXT: stp q3, q4, [x2, #64] +; NO_SVE-NEXT: stp q1, q2, [x2, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v6.4s, v17.4s +; NO_SVE-NEXT: fcmeq v2.4s, v16.4s, v7.4s +; NO_SVE-NEXT: fcmeq v3.4s, v5.4s, v18.4s +; NO_SVE-NEXT: stp q0, q1, [x2, #32] +; NO_SVE-NEXT: stp q2, q3, [x2] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_EQ_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s +; VBITS_EQ_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -229,6 +504,110 @@ } define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_def_cfa_offset 32 +; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset b8, -8 +; NO_SVE-NEXT: .cfi_offset b9, -16 +; NO_SVE-NEXT: .cfi_offset b10, -32 +; NO_SVE-NEXT: ldp q25, q0, [x1, #224] +; NO_SVE-NEXT: ldp q2, q1, [x0, #224] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v25.4s +; NO_SVE-NEXT: ldp q6, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldp q29, q28, [x1, #160] +; NO_SVE-NEXT: ldp q4, q3, [x0, #192] +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, v28.4s +; NO_SVE-NEXT: ldp q27, q26, [x1, #192] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v27.4s +; NO_SVE-NEXT: ldp q16, q7, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v26.4s +; NO_SVE-NEXT: ldp q18, q17, [x0, #96] +; NO_SVE-NEXT: ldp q20, q19, [x0, #64] +; NO_SVE-NEXT: ldp q31, q30, [x1, #128] +; NO_SVE-NEXT: ldp q9, q8, [x1, #96] +; NO_SVE-NEXT: ldp q1, q28, [x1, #64] +; NO_SVE-NEXT: fcmeq v1.4s, v20.4s, v1.4s +; NO_SVE-NEXT: ldp q22, q21, [x0, #32] +; NO_SVE-NEXT: ldp q24, q23, [x0] +; NO_SVE-NEXT: ldp q26, q25, [x1, #32] +; NO_SVE-NEXT: ldp q10, q27, [x1] +; NO_SVE-NEXT: stp q4, q3, [x2, #192] +; NO_SVE-NEXT: stp q2, q0, [x2, #224] +; NO_SVE-NEXT: fcmeq v0.4s, v6.4s, v29.4s +; NO_SVE-NEXT: fcmeq v2.4s, v7.4s, v30.4s +; NO_SVE-NEXT: fcmeq v3.4s, v16.4s, v31.4s +; NO_SVE-NEXT: fcmeq v4.4s, v17.4s, v8.4s +; NO_SVE-NEXT: stp q0, q5, [x2, #160] +; NO_SVE-NEXT: fcmeq v5.4s, v18.4s, v9.4s +; NO_SVE-NEXT: fcmeq v0.4s, v19.4s, v28.4s +; NO_SVE-NEXT: stp q3, q2, [x2, #128] +; NO_SVE-NEXT: fcmeq v2.4s, v21.4s, v25.4s +; NO_SVE-NEXT: fcmeq v3.4s, v22.4s, v26.4s +; NO_SVE-NEXT: stp q5, q4, [x2, #96] +; NO_SVE-NEXT: fcmeq v4.4s, v23.4s, v27.4s +; NO_SVE-NEXT: stp q1, q0, [x2, #64] +; NO_SVE-NEXT: fcmeq v0.4s, v24.4s, v10.4s +; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; NO_SVE-NEXT: stp q3, q2, [x2, #32] +; NO_SVE-NEXT: stp q0, q4, [x2] +; NO_SVE-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, #24 +; VBITS_EQ_256-NEXT: mov x14, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_EQ_256-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff +; 
VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s +; VBITS_EQ_256-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s +; VBITS_EQ_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s +; VBITS_EQ_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -248,6 +627,11 @@ ; Don't use SVE for 64-bit vectors. define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq d0, d0, d1 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq d0, d0, d1 @@ -259,6 +643,11 @@ ; Don't use SVE for 128-bit vectors. define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, v1.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d @@ -269,6 +658,15 @@ } define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NO_SVE-NEXT: fcmeq v1.2d, v2.2d, v3.2d +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -288,6 +686,20 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcmp_oeq_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, v5.2d +; NO_SVE-NEXT: ldp q6, q1, [x1] +; NO_SVE-NEXT: stp q0, q2, [x2, #32] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, v6.2d +; NO_SVE-NEXT: fcmeq v1.2d, v4.2d, v1.2d +; NO_SVE-NEXT: stp q3, q1, [x2] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcmp_oeq_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -322,6 +734,58 @@ } define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q17, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #96] +; NO_SVE-NEXT: ldp q5, q6, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.2d, v5.2d, v0.2d +; NO_SVE-NEXT: ldp q7, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, v7.2d +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; 
NO_SVE-NEXT: fcmeq v2.2d, v2.2d, v16.2d +; NO_SVE-NEXT: ldp q5, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, v5.2d +; NO_SVE-NEXT: ldp q16, q5, [x0] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, v7.2d +; NO_SVE-NEXT: ldp q7, q18, [x1] +; NO_SVE-NEXT: stp q3, q4, [x2, #64] +; NO_SVE-NEXT: stp q1, q2, [x2, #96] +; NO_SVE-NEXT: fcmeq v1.2d, v6.2d, v17.2d +; NO_SVE-NEXT: fcmeq v2.2d, v16.2d, v7.2d +; NO_SVE-NEXT: fcmeq v3.2d, v5.2d, v18.2d +; NO_SVE-NEXT: stp q0, q1, [x2, #32] +; NO_SVE-NEXT: stp q2, q3, [x2] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d +; VBITS_EQ_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d +; VBITS_EQ_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -340,6 +804,110 @@ } define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_def_cfa_offset 32 +; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset b8, -8 +; NO_SVE-NEXT: .cfi_offset b9, -16 +; NO_SVE-NEXT: .cfi_offset b10, -32 +; NO_SVE-NEXT: ldp q25, q0, [x1, #224] +; NO_SVE-NEXT: ldp q2, q1, [x0, #224] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, v25.2d +; NO_SVE-NEXT: ldp q6, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NO_SVE-NEXT: ldp q29, q28, [x1, #160] +; NO_SVE-NEXT: ldp q4, q3, [x0, #192] +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, v28.2d +; NO_SVE-NEXT: ldp q27, q26, [x1, #192] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, v27.2d +; NO_SVE-NEXT: ldp q16, q7, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, v26.2d +; NO_SVE-NEXT: ldp q18, q17, [x0, #96] +; NO_SVE-NEXT: ldp q20, q19, [x0, #64] +; NO_SVE-NEXT: ldp q31, q30, [x1, #128] +; NO_SVE-NEXT: ldp q9, q8, [x1, #96] +; NO_SVE-NEXT: ldp q1, q28, [x1, #64] +; NO_SVE-NEXT: fcmeq v1.2d, v20.2d, v1.2d +; NO_SVE-NEXT: ldp q22, q21, [x0, #32] +; NO_SVE-NEXT: ldp q24, q23, [x0] +; NO_SVE-NEXT: ldp q26, q25, [x1, #32] +; NO_SVE-NEXT: ldp q10, q27, [x1] +; NO_SVE-NEXT: stp q4, q3, [x2, #192] +; NO_SVE-NEXT: stp q2, q0, [x2, #224] +; NO_SVE-NEXT: fcmeq v0.2d, v6.2d, v29.2d +; NO_SVE-NEXT: fcmeq v2.2d, v7.2d, v30.2d +; NO_SVE-NEXT: fcmeq v3.2d, v16.2d, v31.2d +; NO_SVE-NEXT: fcmeq v4.2d, v17.2d, v8.2d +; NO_SVE-NEXT: stp q0, q5, [x2, #160] +; NO_SVE-NEXT: fcmeq v5.2d, v18.2d, v9.2d +; NO_SVE-NEXT: fcmeq v0.2d, v19.2d, v28.2d +; NO_SVE-NEXT: stp q3, q2, [x2, #128] +; NO_SVE-NEXT: fcmeq v2.2d, v21.2d, v25.2d +; NO_SVE-NEXT: fcmeq v3.2d, v22.2d, v26.2d +; NO_SVE-NEXT: stp q5, q4, [x2, #96] +; NO_SVE-NEXT: fcmeq v4.2d, v23.2d, v27.2d +; NO_SVE-NEXT: stp q1, q0, [x2, #64] +; NO_SVE-NEXT: fcmeq v0.2d, v24.2d, v10.2d +; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; NO_SVE-NEXT: stp q3, q2, [x2, #32] +; NO_SVE-NEXT: stp q0, q4, [x2] +; NO_SVE-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov x14, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d +; VBITS_EQ_256-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; 
VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d +; VBITS_EQ_256-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d +; VBITS_EQ_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d +; VBITS_EQ_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -362,6 +930,21 @@ ; define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ueq_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q2, q1, [x0] +; NO_SVE-NEXT: fcmgt v4.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: fcmgt v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: orr v1.16b, v2.16b, v1.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ueq_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -386,6 +969,19 @@ ; define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_one_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v5.8h, v1.8h, v3.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v1.8h +; NO_SVE-NEXT: fcmgt v4.8h, v2.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: orr v1.16b, v1.16b, v5.16b +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_one_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -410,6 +1006,17 @@ ; define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_une_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -432,6 +1039,15 @@ ; define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ogt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ogt_v16f16: ; CHECK: // %bb.0: ; 
CHECK-NEXT: ptrue p0.h, vl16 @@ -454,6 +1070,17 @@ ; define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ugt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmge v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ugt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -478,6 +1105,15 @@ ; define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_olt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_olt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -500,6 +1136,17 @@ ; define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ult_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ult_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -524,6 +1171,15 @@ ; define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oge_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oge_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -546,6 +1202,17 @@ ; define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_uge_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_uge_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -570,6 +1237,15 @@ ; define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ole_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmge v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ole_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -592,6 +1268,17 @@ ; define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ule_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ule_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -616,6 +1303,21 @@ ; define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, 
<16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_uno_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q2, q1, [x0] +; NO_SVE-NEXT: fcmge v4.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: fcmgt v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: orr v1.16b, v2.16b, v1.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_uno_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -638,6 +1340,19 @@ ; define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ord_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v5.8h, v1.8h, v3.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v1.8h +; NO_SVE-NEXT: fcmge v4.8h, v2.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: orr v1.16b, v1.16b, v5.16b +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ord_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -662,6 +1377,15 @@ ; define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_eq_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_eq_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -684,6 +1408,17 @@ ; define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ne_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ne_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -706,6 +1441,15 @@ ; define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_gt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_gt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -728,6 +1472,91 @@ ; define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_lt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x1, #16] +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov h2, v0.h[1] +; NO_SVE-NEXT: mov h4, v0.h[3] +; NO_SVE-NEXT: mov h3, v1.h[1] +; NO_SVE-NEXT: mov h5, v1.h[3] +; NO_SVE-NEXT: mov h6, v1.h[6] +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[2] +; NO_SVE-NEXT: mov h3, v1.h[2] +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: csetm w9, lt +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[4] +; NO_SVE-NEXT: mov h3, v1.h[4] +; NO_SVE-NEXT: csetm w10, lt +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: mov h4, v0.h[5] +; NO_SVE-NEXT: mov h5, v1.h[5] +; NO_SVE-NEXT: mov h1, v1.h[7] +; NO_SVE-NEXT: csetm w11, lt +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h3, v0.h[6] +; 
NO_SVE-NEXT: ldr q2, [x0] +; NO_SVE-NEXT: mov h0, v0.h[7] +; NO_SVE-NEXT: csetm w12, lt +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: ldr q4, [x1] +; NO_SVE-NEXT: mov h5, v2.h[1] +; NO_SVE-NEXT: csetm w13, lt +; NO_SVE-NEXT: fcmp h6, h3 +; NO_SVE-NEXT: mov h3, v4.h[1] +; NO_SVE-NEXT: mov h6, v4.h[3] +; NO_SVE-NEXT: csetm w14, lt +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: mov h0, v4.h[2] +; NO_SVE-NEXT: mov h1, v2.h[2] +; NO_SVE-NEXT: csetm w15, lt +; NO_SVE-NEXT: fcmp h5, h3 +; NO_SVE-NEXT: fmov s3, w9 +; NO_SVE-NEXT: csetm w16, lt +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[1], w8 +; NO_SVE-NEXT: csetm w17, lt +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: mov h0, v2.h[3] +; NO_SVE-NEXT: mov h1, v4.h[4] +; NO_SVE-NEXT: mov v3.h[2], w10 +; NO_SVE-NEXT: fmov s5, w17 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[4] +; NO_SVE-NEXT: mov v5.h[1], w16 +; NO_SVE-NEXT: mov h6, v4.h[5] +; NO_SVE-NEXT: mov v3.h[3], w11 +; NO_SVE-NEXT: mov v5.h[2], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov h0, v2.h[5] +; NO_SVE-NEXT: mov h1, v4.h[6] +; NO_SVE-NEXT: mov v3.h[4], w12 +; NO_SVE-NEXT: mov h4, v4.h[7] +; NO_SVE-NEXT: mov v5.h[3], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[6] +; NO_SVE-NEXT: mov h2, v2.h[7] +; NO_SVE-NEXT: mov v5.h[4], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov v3.h[5], w13 +; NO_SVE-NEXT: mov v5.h[5], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[6], w14 +; NO_SVE-NEXT: mov v5.h[6], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: mov v3.h[7], w15 +; NO_SVE-NEXT: mov v5.h[7], w8 +; NO_SVE-NEXT: stp q5, q3, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_lt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -750,6 +1579,15 @@ ; define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ge_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ge_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -772,6 +1610,91 @@ ; define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_le_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x1, #16] +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov h2, v0.h[1] +; NO_SVE-NEXT: mov h4, v0.h[3] +; NO_SVE-NEXT: mov h3, v1.h[1] +; NO_SVE-NEXT: mov h5, v1.h[3] +; NO_SVE-NEXT: mov h6, v1.h[6] +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[2] +; NO_SVE-NEXT: mov h3, v1.h[2] +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: csetm w9, le +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[4] +; NO_SVE-NEXT: mov h3, v1.h[4] +; NO_SVE-NEXT: csetm w10, le +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: mov h4, v0.h[5] +; NO_SVE-NEXT: mov h5, v1.h[5] +; NO_SVE-NEXT: mov h1, v1.h[7] +; NO_SVE-NEXT: csetm w11, le +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h3, v0.h[6] +; NO_SVE-NEXT: ldr q2, [x0] +; NO_SVE-NEXT: mov h0, v0.h[7] +; NO_SVE-NEXT: csetm w12, le +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: ldr q4, [x1] +; NO_SVE-NEXT: mov h5, v2.h[1] +; NO_SVE-NEXT: csetm w13, le +; NO_SVE-NEXT: fcmp h6, h3 +; NO_SVE-NEXT: mov h3, v4.h[1] +; NO_SVE-NEXT: mov h6, v4.h[3] +; NO_SVE-NEXT: csetm w14, le +; NO_SVE-NEXT: fcmp h1, 
h0 +; NO_SVE-NEXT: mov h0, v4.h[2] +; NO_SVE-NEXT: mov h1, v2.h[2] +; NO_SVE-NEXT: csetm w15, le +; NO_SVE-NEXT: fcmp h5, h3 +; NO_SVE-NEXT: fmov s3, w9 +; NO_SVE-NEXT: csetm w16, le +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[1], w8 +; NO_SVE-NEXT: csetm w17, le +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: mov h0, v2.h[3] +; NO_SVE-NEXT: mov h1, v4.h[4] +; NO_SVE-NEXT: mov v3.h[2], w10 +; NO_SVE-NEXT: fmov s5, w17 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[4] +; NO_SVE-NEXT: mov v5.h[1], w16 +; NO_SVE-NEXT: mov h6, v4.h[5] +; NO_SVE-NEXT: mov v3.h[3], w11 +; NO_SVE-NEXT: mov v5.h[2], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov h0, v2.h[5] +; NO_SVE-NEXT: mov h1, v4.h[6] +; NO_SVE-NEXT: mov v3.h[4], w12 +; NO_SVE-NEXT: mov h4, v4.h[7] +; NO_SVE-NEXT: mov v5.h[3], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[6] +; NO_SVE-NEXT: mov h2, v2.h[7] +; NO_SVE-NEXT: mov v5.h[4], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov v3.h[5], w13 +; NO_SVE-NEXT: mov v5.h[5], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[6], w14 +; NO_SVE-NEXT: mov v5.h[6], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: mov v3.h[7], w15 +; NO_SVE-NEXT: mov v5.h[7], w8 +; NO_SVE-NEXT: stp q5, q3, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_le_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK @@ -26,6 +27,13 @@ ; Don't use SVE for 64-bit vectors. define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v2f16_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: fcvtl v0.4s, v0.4h +; NO_SVE-NEXT: str d0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v2f16_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] @@ -40,6 +48,13 @@ ; Don't use SVE for 128-bit vectors. define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v4f16_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: fcvtl v0.4s, v0.4h +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v4f16_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -53,6 +68,14 @@ } define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v8f16_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: fcvtl2 v1.4s, v0.8h +; NO_SVE-NEXT: fcvtl v0.4s, v0.4h +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v8f16_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -68,6 +91,17 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: fcvt_v16f16_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcvtl2 v3.4s, v1.8h +; NO_SVE-NEXT: fcvtl v1.4s, v1.4h +; NO_SVE-NEXT: fcvtl2 v2.4s, v0.8h +; NO_SVE-NEXT: fcvtl v0.4s, v0.4h +; NO_SVE-NEXT: stp q1, q3, [x1] +; NO_SVE-NEXT: stp q0, q2, [x1, #32] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcvt_v16f16_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -95,6 +129,44 @@ } define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v32f16_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: fcvtl2 v4.4s, v1.8h +; NO_SVE-NEXT: fcvtl v1.4s, v1.4h +; NO_SVE-NEXT: ldp q5, q3, [x0] +; NO_SVE-NEXT: fcvtl2 v2.4s, v0.8h +; NO_SVE-NEXT: fcvtl v0.4s, v0.4h +; NO_SVE-NEXT: stp q1, q4, [x1, #64] +; NO_SVE-NEXT: stp q0, q2, [x1, #96] +; NO_SVE-NEXT: fcvtl2 v0.4s, v5.8h +; NO_SVE-NEXT: fcvtl2 v6.4s, v3.8h +; NO_SVE-NEXT: fcvtl v2.4s, v3.4h +; NO_SVE-NEXT: fcvtl v3.4s, v5.4h +; NO_SVE-NEXT: stp q2, q6, [x1, #32] +; NO_SVE-NEXT: stp q3, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v32f16_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.h +; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h +; VBITS_EQ_256-NEXT: fcvt z2.s, p0/m, z2.h +; VBITS_EQ_256-NEXT: fcvt z3.s, p0/m, z3.h +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -109,6 +181,74 @@ } define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v64f16_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: fcvtl2 v7.4s, v1.8h +; NO_SVE-NEXT: fcvtl v1.4s, v1.4h +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: fcvtl2 v4.4s, v0.8h +; NO_SVE-NEXT: fcvtl v0.4s, v0.4h +; NO_SVE-NEXT: ldp q6, q5, [x0, #32] +; NO_SVE-NEXT: fcvtl2 v18.4s, v2.8h +; NO_SVE-NEXT: fcvtl v2.4s, v2.4h +; NO_SVE-NEXT: ldp q17, q16, [x0] +; NO_SVE-NEXT: stp q2, q18, [x1, #160] +; NO_SVE-NEXT: fcvtl2 v2.4s, v6.8h +; NO_SVE-NEXT: stp q1, q7, [x1, #192] +; NO_SVE-NEXT: fcvtl2 v1.4s, v5.8h +; NO_SVE-NEXT: stp q0, q4, [x1, #224] +; NO_SVE-NEXT: fcvtl2 v0.4s, v3.8h +; NO_SVE-NEXT: fcvtl v3.4s, v3.4h +; NO_SVE-NEXT: fcvtl v4.4s, v5.4h +; NO_SVE-NEXT: stp q4, q1, [x1, #96] +; NO_SVE-NEXT: fcvtl v1.4s, v16.4h +; NO_SVE-NEXT: stp q3, q0, [x1, #128] +; NO_SVE-NEXT: fcvtl v0.4s, v6.4h +; NO_SVE-NEXT: fcvtl2 v3.4s, v16.8h +; NO_SVE-NEXT: fcvtl2 v4.4s, v17.8h +; NO_SVE-NEXT: stp q0, q2, [x1, #64] +; NO_SVE-NEXT: fcvtl v2.4s, v17.4h +; NO_SVE-NEXT: stp q1, q3, [x1, #32] +; NO_SVE-NEXT: stp q2, q4, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v64f16_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: mov x12, #32 +; VBITS_EQ_256-NEXT: mov x13, 
#56 +; VBITS_EQ_256-NEXT: mov x14, #48 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.s }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p0/z, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.s }, p0/z, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.s }, p0/z, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.h +; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h +; VBITS_EQ_256-NEXT: fcvt z2.s, p0/m, z2.h +; VBITS_EQ_256-NEXT: fcvt z3.s, p0/m, z3.h +; VBITS_EQ_256-NEXT: fcvt z4.s, p0/m, z4.h +; VBITS_EQ_256-NEXT: fcvt z5.s, p0/m, z5.h +; VBITS_EQ_256-NEXT: fcvt z6.s, p0/m, z6.h +; VBITS_EQ_256-NEXT: fcvt z7.s, p0/m, z7.h +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x1, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x1, x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -128,6 +268,13 @@ ; Don't use SVE for 64-bit vectors. define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v1f16_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: fcvt d0, h0 +; NO_SVE-NEXT: str d0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v1f16_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr h0, [x0] @@ -142,6 +289,16 @@ ; v2f16 is not legal for NEON, so use SVE define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v2f16_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: mov h1, v0.h[1] +; NO_SVE-NEXT: fcvt d0, h0 +; NO_SVE-NEXT: fcvt d1, h1 +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v2f16_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] @@ -158,6 +315,21 @@ } define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v4f16_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: mov h1, v0.h[3] +; NO_SVE-NEXT: mov h2, v0.h[2] +; NO_SVE-NEXT: mov h3, v0.h[1] +; NO_SVE-NEXT: fcvt d0, h0 +; NO_SVE-NEXT: fcvt d1, h1 +; NO_SVE-NEXT: fcvt d2, h2 +; NO_SVE-NEXT: fcvt d3, h3 +; NO_SVE-NEXT: mov v2.d[1], v1.d[0] +; NO_SVE-NEXT: mov v0.d[1], v3.d[0] +; NO_SVE-NEXT: stp q0, q2, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v4f16_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -172,6 +344,32 @@ } define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v8f16_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: mov h2, v0.h[3] +; NO_SVE-NEXT: mov h3, v0.h[2] +; NO_SVE-NEXT: mov h4, v0.h[1] +; NO_SVE-NEXT: fcvt d0, h0 +; NO_SVE-NEXT: mov h5, v1.h[3] +; NO_SVE-NEXT: mov h6, v1.h[2] +; NO_SVE-NEXT: mov h7, v1.h[1] +; NO_SVE-NEXT: fcvt d2, h2 +; NO_SVE-NEXT: fcvt d3, h3 +; NO_SVE-NEXT: fcvt d4, h4 +; NO_SVE-NEXT: fcvt d1, h1 +; NO_SVE-NEXT: fcvt d5, h5 +; NO_SVE-NEXT: fcvt d6, h6 +; 
NO_SVE-NEXT: fcvt d7, h7 +; NO_SVE-NEXT: mov v3.d[1], v2.d[0] +; NO_SVE-NEXT: mov v0.d[1], v4.d[0] +; NO_SVE-NEXT: mov v6.d[1], v5.d[0] +; NO_SVE-NEXT: mov v1.d[1], v7.d[0] +; NO_SVE-NEXT: stp q0, q3, [x1] +; NO_SVE-NEXT: stp q1, q6, [x1, #32] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -199,6 +397,73 @@ } define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v16f16_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x0] +; NO_SVE-NEXT: mov h4, v2.h[1] +; NO_SVE-NEXT: mov h16, v2.h[3] +; NO_SVE-NEXT: mov h17, v2.h[2] +; NO_SVE-NEXT: mov h5, v1.h[1] +; NO_SVE-NEXT: mov h6, v1.h[3] +; NO_SVE-NEXT: mov h7, v1.h[2] +; NO_SVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: fcvt d1, h1 +; NO_SVE-NEXT: ext v0.16b, v2.16b, v2.16b, #8 +; NO_SVE-NEXT: fcvt d5, h5 +; NO_SVE-NEXT: fcvt d6, h6 +; NO_SVE-NEXT: fcvt d7, h7 +; NO_SVE-NEXT: fcvt d2, h2 +; NO_SVE-NEXT: fcvt d4, h4 +; NO_SVE-NEXT: fcvt d16, h16 +; NO_SVE-NEXT: fcvt d17, h17 +; NO_SVE-NEXT: mov v1.d[1], v5.d[0] +; NO_SVE-NEXT: mov h5, v3.h[1] +; NO_SVE-NEXT: mov v7.d[1], v6.d[0] +; NO_SVE-NEXT: mov v2.d[1], v4.d[0] +; NO_SVE-NEXT: mov h4, v0.h[3] +; NO_SVE-NEXT: mov h6, v3.h[3] +; NO_SVE-NEXT: mov h18, v3.h[2] +; NO_SVE-NEXT: mov v17.d[1], v16.d[0] +; NO_SVE-NEXT: mov h16, v0.h[2] +; NO_SVE-NEXT: stp q1, q7, [x1, #64] +; NO_SVE-NEXT: mov h1, v0.h[1] +; NO_SVE-NEXT: fcvt d3, h3 +; NO_SVE-NEXT: fcvt d5, h5 +; NO_SVE-NEXT: fcvt d6, h6 +; NO_SVE-NEXT: fcvt d7, h18 +; NO_SVE-NEXT: fcvt d4, h4 +; NO_SVE-NEXT: fcvt d16, h16 +; NO_SVE-NEXT: fcvt d1, h1 +; NO_SVE-NEXT: fcvt d0, h0 +; NO_SVE-NEXT: mov v3.d[1], v5.d[0] +; NO_SVE-NEXT: stp q2, q17, [x1] +; NO_SVE-NEXT: mov v7.d[1], v6.d[0] +; NO_SVE-NEXT: mov v16.d[1], v4.d[0] +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: stp q3, q7, [x1, #96] +; NO_SVE-NEXT: stp q0, q16, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v16f16_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.h +; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h +; VBITS_EQ_256-NEXT: fcvt z2.d, p0/m, z2.h +; VBITS_EQ_256-NEXT: fcvt z3.d, p0/m, z3.h +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -213,6 +478,132 @@ } define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v32f16_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q4, [x0, #32] +; NO_SVE-NEXT: mov h18, v1.h[1] +; NO_SVE-NEXT: fcvt d19, h1 +; NO_SVE-NEXT: mov h16, v1.h[3] +; NO_SVE-NEXT: mov h17, v1.h[2] +; NO_SVE-NEXT: ext v5.16b, v4.16b, v4.16b, #8 +; NO_SVE-NEXT: mov h20, v4.h[3] +; NO_SVE-NEXT: mov h21, v4.h[2] +; NO_SVE-NEXT: mov h22, v4.h[1] +; NO_SVE-NEXT: fcvt d4, h4 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: mov h23, v5.h[1] +; NO_SVE-NEXT: fcvt d18, h18 +; NO_SVE-NEXT: fcvt d20, h20 
+; NO_SVE-NEXT: fcvt d21, h21 +; NO_SVE-NEXT: fcvt d22, h22 +; NO_SVE-NEXT: fcvt d16, h16 +; NO_SVE-NEXT: fcvt d17, h17 +; NO_SVE-NEXT: mov h24, v2.h[2] +; NO_SVE-NEXT: mov v19.d[1], v18.d[0] +; NO_SVE-NEXT: fcvt d18, h23 +; NO_SVE-NEXT: mov h6, v3.h[1] +; NO_SVE-NEXT: mov v21.d[1], v20.d[0] +; NO_SVE-NEXT: mov h23, v3.h[2] +; NO_SVE-NEXT: mov v4.d[1], v22.d[0] +; NO_SVE-NEXT: mov h22, v3.h[3] +; NO_SVE-NEXT: fcvt d7, h3 +; NO_SVE-NEXT: fcvt d6, h6 +; NO_SVE-NEXT: mov h20, v2.h[3] +; NO_SVE-NEXT: mov h25, v2.h[1] +; NO_SVE-NEXT: stp q4, q21, [x1, #192] +; NO_SVE-NEXT: fcvt d4, h5 +; NO_SVE-NEXT: fcvt d21, h22 +; NO_SVE-NEXT: fcvt d22, h23 +; NO_SVE-NEXT: ext v0.16b, v2.16b, v2.16b, #8 +; NO_SVE-NEXT: fcvt d23, h24 +; NO_SVE-NEXT: mov v7.d[1], v6.d[0] +; NO_SVE-NEXT: mov v17.d[1], v16.d[0] +; NO_SVE-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; NO_SVE-NEXT: fcvt d20, h20 +; NO_SVE-NEXT: fcvt d24, h25 +; NO_SVE-NEXT: fcvt d2, h2 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: mov h6, v0.h[3] +; NO_SVE-NEXT: mov v22.d[1], v21.d[0] +; NO_SVE-NEXT: stp q19, q17, [x1, #128] +; NO_SVE-NEXT: mov h16, v0.h[1] +; NO_SVE-NEXT: mov v4.d[1], v18.d[0] +; NO_SVE-NEXT: mov h17, v0.h[2] +; NO_SVE-NEXT: mov v23.d[1], v20.d[0] +; NO_SVE-NEXT: mov v2.d[1], v24.d[0] +; NO_SVE-NEXT: stp q7, q22, [x1, #64] +; NO_SVE-NEXT: mov h7, v3.h[3] +; NO_SVE-NEXT: mov h18, v3.h[2] +; NO_SVE-NEXT: mov h19, v3.h[1] +; NO_SVE-NEXT: mov h20, v5.h[3] +; NO_SVE-NEXT: mov h5, v5.h[2] +; NO_SVE-NEXT: mov h21, v1.h[1] +; NO_SVE-NEXT: mov h22, v1.h[3] +; NO_SVE-NEXT: mov h24, v1.h[2] +; NO_SVE-NEXT: fcvt d6, h6 +; NO_SVE-NEXT: fcvt d16, h16 +; NO_SVE-NEXT: fcvt d7, h7 +; NO_SVE-NEXT: fcvt d18, h18 +; NO_SVE-NEXT: fcvt d19, h19 +; NO_SVE-NEXT: fcvt d20, h20 +; NO_SVE-NEXT: fcvt d5, h5 +; NO_SVE-NEXT: fcvt d21, h21 +; NO_SVE-NEXT: fcvt d1, h1 +; NO_SVE-NEXT: fcvt d22, h22 +; NO_SVE-NEXT: fcvt d24, h24 +; NO_SVE-NEXT: fcvt d3, h3 +; NO_SVE-NEXT: stp q2, q23, [x1] +; NO_SVE-NEXT: fcvt d0, h0 +; NO_SVE-NEXT: fcvt d2, h17 +; NO_SVE-NEXT: mov v5.d[1], v20.d[0] +; NO_SVE-NEXT: mov v1.d[1], v21.d[0] +; NO_SVE-NEXT: mov v24.d[1], v22.d[0] +; NO_SVE-NEXT: mov v3.d[1], v19.d[0] +; NO_SVE-NEXT: mov v18.d[1], v7.d[0] +; NO_SVE-NEXT: stp q4, q5, [x1, #224] +; NO_SVE-NEXT: mov v0.d[1], v16.d[0] +; NO_SVE-NEXT: mov v2.d[1], v6.d[0] +; NO_SVE-NEXT: stp q1, q24, [x1, #160] +; NO_SVE-NEXT: stp q3, q18, [x1, #96] +; NO_SVE-NEXT: stp q0, q2, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v32f16_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, #28 +; VBITS_EQ_256-NEXT: mov x14, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p0/z, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.d }, p0/z, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p0/z, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.h +; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h +; VBITS_EQ_256-NEXT: fcvt z2.d, p0/m, z2.h +; VBITS_EQ_256-NEXT: fcvt z3.d, p0/m, z3.h +; VBITS_EQ_256-NEXT: fcvt z4.d, p0/m, z4.h +; VBITS_EQ_256-NEXT: fcvt z5.d, p0/m, z5.h +; 
VBITS_EQ_256-NEXT: fcvt z6.d, p0/m, z6.h +; VBITS_EQ_256-NEXT: fcvt z7.d, p0/m, z7.h +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -232,6 +623,13 @@ ; Don't use SVE for 64-bit vectors. define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v1f32_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: fcvtl v0.2d, v0.2s +; NO_SVE-NEXT: str d0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v1f32_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] @@ -246,6 +644,13 @@ ; Don't use SVE for 128-bit vectors. define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v2f32_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: fcvtl v0.2d, v0.2s +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v2f32_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -259,6 +664,14 @@ } define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v4f32_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: fcvtl2 v1.2d, v0.4s +; NO_SVE-NEXT: fcvtl v0.2d, v0.2s +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v4f32_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -274,6 +687,17 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: fcvt_v8f32_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcvtl2 v3.2d, v1.4s +; NO_SVE-NEXT: fcvtl v1.2d, v1.2s +; NO_SVE-NEXT: fcvtl2 v2.2d, v0.4s +; NO_SVE-NEXT: fcvtl v0.2d, v0.2s +; NO_SVE-NEXT: stp q1, q3, [x1] +; NO_SVE-NEXT: stp q0, q2, [x1, #32] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcvt_v8f32_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -300,6 +724,44 @@ } define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v16f32_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: fcvtl2 v4.2d, v1.4s +; NO_SVE-NEXT: fcvtl v1.2d, v1.2s +; NO_SVE-NEXT: ldp q5, q3, [x0] +; NO_SVE-NEXT: fcvtl2 v2.2d, v0.4s +; NO_SVE-NEXT: fcvtl v0.2d, v0.2s +; NO_SVE-NEXT: stp q1, q4, [x1, #64] +; NO_SVE-NEXT: stp q0, q2, [x1, #96] +; NO_SVE-NEXT: fcvtl2 v0.2d, v5.4s +; NO_SVE-NEXT: fcvtl2 v6.2d, v3.4s +; NO_SVE-NEXT: fcvtl v2.2d, v3.2s +; NO_SVE-NEXT: fcvtl v3.2d, v5.2s +; NO_SVE-NEXT: stp q2, q6, [x1, #32] +; NO_SVE-NEXT: stp q3, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v16f32_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.s +; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s +; VBITS_EQ_256-NEXT: fcvt z2.d, p0/m, z2.s +; VBITS_EQ_256-NEXT: fcvt z3.d, p0/m, z3.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -314,6 +776,74 @@ } define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: fcvt_v32f32_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: fcvtl2 v7.2d, v1.4s +; NO_SVE-NEXT: fcvtl v1.2d, v1.2s +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: fcvtl2 v4.2d, v0.4s +; NO_SVE-NEXT: fcvtl v0.2d, v0.2s +; NO_SVE-NEXT: ldp q6, q5, [x0, #32] +; NO_SVE-NEXT: fcvtl2 v18.2d, v2.4s +; NO_SVE-NEXT: fcvtl v2.2d, v2.2s +; NO_SVE-NEXT: ldp q17, q16, [x0] +; NO_SVE-NEXT: stp q2, q18, [x1, #160] +; NO_SVE-NEXT: fcvtl2 v2.2d, v6.4s +; NO_SVE-NEXT: stp q1, q7, [x1, #192] +; NO_SVE-NEXT: fcvtl2 v1.2d, v5.4s +; NO_SVE-NEXT: stp q0, q4, [x1, #224] +; NO_SVE-NEXT: fcvtl2 v0.2d, v3.4s +; NO_SVE-NEXT: fcvtl v3.2d, v3.2s +; NO_SVE-NEXT: fcvtl v4.2d, v5.2s +; NO_SVE-NEXT: stp q4, q1, [x1, #96] +; NO_SVE-NEXT: fcvtl v1.2d, v16.2s +; NO_SVE-NEXT: stp q3, q0, [x1, #128] +; NO_SVE-NEXT: fcvtl v0.2d, v6.2s +; NO_SVE-NEXT: fcvtl2 v3.2d, v16.4s +; NO_SVE-NEXT: fcvtl2 v4.2d, v17.4s +; NO_SVE-NEXT: stp q0, q2, [x1, #64] +; NO_SVE-NEXT: fcvtl v2.2d, v17.2s +; NO_SVE-NEXT: stp q1, q3, [x1, #32] +; NO_SVE-NEXT: stp q2, q4, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v32f32_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, 
#28 +; VBITS_EQ_256-NEXT: mov x14, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.d }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p0/z, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.d }, p0/z, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.s +; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s +; VBITS_EQ_256-NEXT: fcvt z2.d, p0/m, z2.s +; VBITS_EQ_256-NEXT: fcvt z3.d, p0/m, z3.s +; VBITS_EQ_256-NEXT: fcvt z4.d, p0/m, z4.s +; VBITS_EQ_256-NEXT: fcvt z5.d, p0/m, z5.s +; VBITS_EQ_256-NEXT: fcvt z6.d, p0/m, z6.s +; VBITS_EQ_256-NEXT: fcvt z7.d, p0/m, z7.s +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -333,6 +863,13 @@ ; Don't use SVE for 64-bit vectors. define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v2f32_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: str s0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v2f32_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -347,6 +884,13 @@ ; Don't use SVE for 128-bit vectors. 
define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v4f32_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: str d0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v4f32_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -360,6 +904,15 @@ } define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v8f32_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v8f32_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -375,6 +928,19 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcvt_v16f32_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcvt_v16f32_v16f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -401,6 +967,48 @@ } define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v32f32_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #96] +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: fcvtn v4.4h, v4.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: fcvtn v5.4h, v5.4s +; NO_SVE-NEXT: stp q0, q2, [x1, #32] +; NO_SVE-NEXT: fcvtn v6.4h, v6.4s +; NO_SVE-NEXT: mov v4.d[1], v5.d[0] +; NO_SVE-NEXT: fcvtn v7.4h, v7.4s +; NO_SVE-NEXT: mov v6.d[1], v7.d[0] +; NO_SVE-NEXT: stp q4, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v32f32_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.s +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.s +; VBITS_EQ_256-NEXT: fcvt z2.h, p0/m, z2.s +; VBITS_EQ_256-NEXT: fcvt z3.h, p0/m, z3.s +; VBITS_EQ_256-NEXT: st1h { z1.s }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.s }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -415,6 +1023,88 @@ } define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v64f32_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #192] +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: 
ldp q4, q5, [x0] +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: fcvtn v4.4h, v4.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: fcvtn v5.4h, v5.4s +; NO_SVE-NEXT: fcvtn v6.4h, v6.4s +; NO_SVE-NEXT: mov v4.d[1], v5.d[0] +; NO_SVE-NEXT: ldp q16, q17, [x0, #64] +; NO_SVE-NEXT: fcvtn v7.4h, v7.4s +; NO_SVE-NEXT: fcvtn v16.4h, v16.4s +; NO_SVE-NEXT: mov v6.d[1], v7.d[0] +; NO_SVE-NEXT: ldp q18, q19, [x0, #224] +; NO_SVE-NEXT: fcvtn v17.4h, v17.4s +; NO_SVE-NEXT: fcvtn v18.4h, v18.4s +; NO_SVE-NEXT: mov v16.d[1], v17.d[0] +; NO_SVE-NEXT: ldp q20, q21, [x0, #128] +; NO_SVE-NEXT: fcvtn v19.4h, v19.4s +; NO_SVE-NEXT: fcvtn v20.4h, v20.4s +; NO_SVE-NEXT: mov v18.d[1], v19.d[0] +; NO_SVE-NEXT: ldp q22, q23, [x0, #160] +; NO_SVE-NEXT: fcvtn v21.4h, v21.4s +; NO_SVE-NEXT: stp q4, q2, [x1] +; NO_SVE-NEXT: stp q16, q6, [x1, #32] +; NO_SVE-NEXT: stp q0, q18, [x1, #96] +; NO_SVE-NEXT: fcvtn v22.4h, v22.4s +; NO_SVE-NEXT: mov v20.d[1], v21.d[0] +; NO_SVE-NEXT: fcvtn v23.4h, v23.4s +; NO_SVE-NEXT: mov v22.d[1], v23.d[0] +; NO_SVE-NEXT: stp q20, q22, [x1, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v64f32_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: mov x10, #48 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: mov x13, #40 +; VBITS_EQ_256-NEXT: mov x14, #32 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.s +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.s +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: st1h { z1.s }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.s }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: movprfx z0, z5 +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z5.s +; VBITS_EQ_256-NEXT: movprfx z1, z4 +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z4.s +; VBITS_EQ_256-NEXT: st1h { z0.s }, p0, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.s }, p0, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: movprfx z0, z6 +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z6.s +; VBITS_EQ_256-NEXT: movprfx z1, z3 +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z3.s +; VBITS_EQ_256-NEXT: st1h { z0.s }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.s }, p0, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: movprfx z0, z2 +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z2.s +; VBITS_EQ_256-NEXT: movprfx z1, z7 +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z7.s +; VBITS_EQ_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -434,6 +1124,13 @@ ; Don't use SVE for 64-bit vectors. 
define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v1f64_v1f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: fcvt h0, d0 +; NO_SVE-NEXT: str h0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v1f64_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -448,6 +1145,16 @@ ; v2f16 is not legal for NEON, so use SVE define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v2f64_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: mov d1, v0.d[1] +; NO_SVE-NEXT: fcvt h0, d0 +; NO_SVE-NEXT: fcvt h1, d1 +; NO_SVE-NEXT: mov v0.h[1], v1.h[0] +; NO_SVE-NEXT: str s0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v2f64_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -464,6 +1171,21 @@ } define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v4f64_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: mov d1, v0.d[1] +; NO_SVE-NEXT: fcvt h0, d0 +; NO_SVE-NEXT: fcvt h1, d1 +; NO_SVE-NEXT: mov v0.h[1], v1.h[0] +; NO_SVE-NEXT: fcvt h1, d2 +; NO_SVE-NEXT: mov d2, v2.d[1] +; NO_SVE-NEXT: mov v0.h[2], v1.h[0] +; NO_SVE-NEXT: fcvt h1, d2 +; NO_SVE-NEXT: mov v0.h[3], v1.h[0] +; NO_SVE-NEXT: str d0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v4f64_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -479,6 +1201,33 @@ define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcvt_v8f64_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: mov d1, v0.d[1] +; NO_SVE-NEXT: fcvt h0, d0 +; NO_SVE-NEXT: fcvt h3, d2 +; NO_SVE-NEXT: mov d2, v2.d[1] +; NO_SVE-NEXT: fcvt h1, d1 +; NO_SVE-NEXT: fcvt h2, d2 +; NO_SVE-NEXT: mov v0.h[1], v1.h[0] +; NO_SVE-NEXT: ldr q1, [x0, #32] +; NO_SVE-NEXT: mov v0.h[2], v3.h[0] +; NO_SVE-NEXT: fcvt h3, d1 +; NO_SVE-NEXT: mov d1, v1.d[1] +; NO_SVE-NEXT: mov v0.h[3], v2.h[0] +; NO_SVE-NEXT: ldr q2, [x0, #48] +; NO_SVE-NEXT: fcvt h1, d1 +; NO_SVE-NEXT: mov v0.h[4], v3.h[0] +; NO_SVE-NEXT: mov v0.h[5], v1.h[0] +; NO_SVE-NEXT: fcvt h1, d2 +; NO_SVE-NEXT: mov d2, v2.d[1] +; NO_SVE-NEXT: mov v0.h[6], v1.h[0] +; NO_SVE-NEXT: fcvt h1, d2 +; NO_SVE-NEXT: mov v0.h[7], v1.h[0] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcvt_v8f64_v8f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -510,6 +1259,75 @@ } define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v16f64_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x0] +; NO_SVE-NEXT: mov d2, v0.d[1] +; NO_SVE-NEXT: fcvt h0, d0 +; NO_SVE-NEXT: ldp q1, q3, [x0, #64] +; NO_SVE-NEXT: fcvt h7, d5 +; NO_SVE-NEXT: mov d5, v5.d[1] +; NO_SVE-NEXT: fcvt h2, d2 +; NO_SVE-NEXT: mov d4, v1.d[1] +; NO_SVE-NEXT: fcvt h1, d1 +; NO_SVE-NEXT: fcvt h16, d3 +; NO_SVE-NEXT: mov d3, v3.d[1] +; NO_SVE-NEXT: mov v0.h[1], v2.h[0] +; NO_SVE-NEXT: ldr q6, [x0, #96] +; NO_SVE-NEXT: fcvt h4, d4 +; NO_SVE-NEXT: ldr q2, [x0, #32] +; NO_SVE-NEXT: fcvt h3, d3 +; NO_SVE-NEXT: mov v0.h[2], v7.h[0] +; NO_SVE-NEXT: fcvt h7, d6 +; NO_SVE-NEXT: mov v1.h[1], v4.h[0] +; NO_SVE-NEXT: fcvt h4, d5 +; NO_SVE-NEXT: fcvt h5, d2 +; NO_SVE-NEXT: mov d6, v6.d[1] +; NO_SVE-NEXT: mov d2, v2.d[1] +; NO_SVE-NEXT: mov v1.h[2], v16.h[0] +; NO_SVE-NEXT: mov v0.h[3], v4.h[0] +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: fcvt h6, d6 +; NO_SVE-NEXT: fcvt h2, d2 +; NO_SVE-NEXT: mov v1.h[3], v3.h[0] +; NO_SVE-NEXT: ldr q3, [x0, #112] +; 
NO_SVE-NEXT: mov v0.h[4], v5.h[0] +; NO_SVE-NEXT: fcvt h5, d4 +; NO_SVE-NEXT: mov d4, v4.d[1] +; NO_SVE-NEXT: mov v1.h[4], v7.h[0] +; NO_SVE-NEXT: fcvt h7, d3 +; NO_SVE-NEXT: mov d3, v3.d[1] +; NO_SVE-NEXT: mov v0.h[5], v2.h[0] +; NO_SVE-NEXT: fcvt h2, d4 +; NO_SVE-NEXT: mov v1.h[5], v6.h[0] +; NO_SVE-NEXT: fcvt h3, d3 +; NO_SVE-NEXT: mov v0.h[6], v5.h[0] +; NO_SVE-NEXT: mov v1.h[6], v7.h[0] +; NO_SVE-NEXT: mov v0.h[7], v2.h[0] +; NO_SVE-NEXT: mov v1.h[7], v3.h[0] +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v16f64_v16f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.d +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.d +; VBITS_EQ_256-NEXT: fcvt z2.h, p0/m, z2.d +; VBITS_EQ_256-NEXT: fcvt z3.h, p0/m, z3.d +; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.d }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -524,6 +1342,142 @@ } define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: fcvt_v32f64_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x0] +; NO_SVE-NEXT: mov d1, v0.d[1] +; NO_SVE-NEXT: fcvt h0, d0 +; NO_SVE-NEXT: ldp q16, q19, [x0, #64] +; NO_SVE-NEXT: fcvt h21, d5 +; NO_SVE-NEXT: mov d5, v5.d[1] +; NO_SVE-NEXT: fcvt h1, d1 +; NO_SVE-NEXT: mov d18, v16.d[1] +; NO_SVE-NEXT: fcvt h5, d5 +; NO_SVE-NEXT: ldp q17, q7, [x0, #192] +; NO_SVE-NEXT: mov v0.h[1], v1.h[0] +; NO_SVE-NEXT: fcvt h1, d16 +; NO_SVE-NEXT: fcvt h18, d18 +; NO_SVE-NEXT: fcvt h16, d19 +; NO_SVE-NEXT: mov d19, v19.d[1] +; NO_SVE-NEXT: mov d20, v17.d[1] +; NO_SVE-NEXT: fcvt h17, d17 +; NO_SVE-NEXT: mov v0.h[2], v21.h[0] +; NO_SVE-NEXT: ldp q6, q2, [x0, #128] +; NO_SVE-NEXT: mov v1.h[1], v18.h[0] +; NO_SVE-NEXT: fcvt h23, d7 +; NO_SVE-NEXT: fcvt h20, d20 +; NO_SVE-NEXT: mov d7, v7.d[1] +; NO_SVE-NEXT: fcvt h19, d19 +; NO_SVE-NEXT: mov d22, v6.d[1] +; NO_SVE-NEXT: fcvt h6, d6 +; NO_SVE-NEXT: mov v1.h[2], v16.h[0] +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: fcvt h16, d2 +; NO_SVE-NEXT: mov v17.h[1], v20.h[0] +; NO_SVE-NEXT: mov d2, v2.d[1] +; NO_SVE-NEXT: fcvt h22, d22 +; NO_SVE-NEXT: ldr q3, [x0, #96] +; NO_SVE-NEXT: mov v0.h[3], v5.h[0] +; NO_SVE-NEXT: fcvt h20, d4 +; NO_SVE-NEXT: mov d4, v4.d[1] +; NO_SVE-NEXT: ldr q5, [x0, #224] +; NO_SVE-NEXT: mov v17.h[2], v23.h[0] +; NO_SVE-NEXT: fcvt h7, d7 +; NO_SVE-NEXT: mov v6.h[1], v22.h[0] +; NO_SVE-NEXT: mov v0.h[4], v20.h[0] +; NO_SVE-NEXT: fcvt h4, d4 +; NO_SVE-NEXT: fcvt h2, d2 +; NO_SVE-NEXT: fcvt h18, d3 +; NO_SVE-NEXT: mov v6.h[2], v16.h[0] +; NO_SVE-NEXT: fcvt h16, d5 +; NO_SVE-NEXT: mov v17.h[3], v7.h[0] +; NO_SVE-NEXT: ldr q7, [x0, #160] +; NO_SVE-NEXT: mov v1.h[3], v19.h[0] +; NO_SVE-NEXT: mov d3, v3.d[1] +; NO_SVE-NEXT: mov d5, v5.d[1] +; NO_SVE-NEXT: mov v0.h[5], v4.h[0] +; NO_SVE-NEXT: fcvt h4, d7 +; NO_SVE-NEXT: mov v6.h[3], v2.h[0] +; NO_SVE-NEXT: ldr q2, [x0, #48] +; NO_SVE-NEXT: mov d7, v7.d[1] +; NO_SVE-NEXT: mov v1.h[4], v18.h[0] +; 
NO_SVE-NEXT: fcvt h3, d3 +; NO_SVE-NEXT: mov v17.h[4], v16.h[0] +; NO_SVE-NEXT: ldr q16, [x0, #176] +; NO_SVE-NEXT: fcvt h5, d5 +; NO_SVE-NEXT: ldr q18, [x0, #240] +; NO_SVE-NEXT: mov v6.h[4], v4.h[0] +; NO_SVE-NEXT: fcvt h7, d7 +; NO_SVE-NEXT: ldr q4, [x0, #112] +; NO_SVE-NEXT: fcvt h19, d2 +; NO_SVE-NEXT: mov v1.h[5], v3.h[0] +; NO_SVE-NEXT: fcvt h3, d18 +; NO_SVE-NEXT: mov v17.h[5], v5.h[0] +; NO_SVE-NEXT: mov v6.h[5], v7.h[0] +; NO_SVE-NEXT: fcvt h5, d4 +; NO_SVE-NEXT: fcvt h7, d16 +; NO_SVE-NEXT: mov d4, v4.d[1] +; NO_SVE-NEXT: mov d16, v16.d[1] +; NO_SVE-NEXT: mov d2, v2.d[1] +; NO_SVE-NEXT: mov d18, v18.d[1] +; NO_SVE-NEXT: mov v0.h[6], v19.h[0] +; NO_SVE-NEXT: mov v17.h[6], v3.h[0] +; NO_SVE-NEXT: mov v6.h[6], v7.h[0] +; NO_SVE-NEXT: fcvt h3, d16 +; NO_SVE-NEXT: fcvt h7, d18 +; NO_SVE-NEXT: fcvt h4, d4 +; NO_SVE-NEXT: fcvt h2, d2 +; NO_SVE-NEXT: mov v1.h[6], v5.h[0] +; NO_SVE-NEXT: mov v6.h[7], v3.h[0] +; NO_SVE-NEXT: mov v17.h[7], v7.h[0] +; NO_SVE-NEXT: mov v0.h[7], v2.h[0] +; NO_SVE-NEXT: mov v1.h[7], v4.h[0] +; NO_SVE-NEXT: stp q6, q17, [x1, #32] +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v32f64_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x11, #12 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x13, #20 +; VBITS_EQ_256-NEXT: mov x14, #16 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.d +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.d +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.d }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: movprfx z0, z5 +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z5.d +; VBITS_EQ_256-NEXT: movprfx z1, z4 +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z4.d +; VBITS_EQ_256-NEXT: st1h { z0.d }, p0, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: movprfx z0, z6 +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z6.d +; VBITS_EQ_256-NEXT: movprfx z1, z3 +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z3.d +; VBITS_EQ_256-NEXT: st1h { z0.d }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: movprfx z0, z2 +; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z2.d +; VBITS_EQ_256-NEXT: movprfx z1, z7 +; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z7.d +; VBITS_EQ_256-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -543,6 +1497,13 @@ ; Don't use SVE for 64-bit vectors. 
define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v1f64_v1f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: str s0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v1f64_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -556,6 +1517,12 @@ ; Don't use SVE for 128-bit vectors. define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v2f64_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v2f64_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtn v0.2s, v0.2d @@ -567,6 +1534,14 @@ } define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v4f64_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v1.2d +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcvt_v4f64_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -582,6 +1557,17 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcvt_v8f64_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ldp q1, q3, [x0] +; NO_SVE-NEXT: fcvtn2 v0.4s, v2.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v3.2d +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcvt_v8f64_v8f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -608,6 +1594,44 @@ } define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v16f64_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q4, [x0, #64] +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ldp q1, q5, [x0, #96] +; NO_SVE-NEXT: fcvtn2 v0.4s, v4.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: ldp q2, q6, [x0] +; NO_SVE-NEXT: fcvtn2 v1.4s, v5.2d +; NO_SVE-NEXT: fcvtn v2.2s, v2.2d +; NO_SVE-NEXT: ldp q3, q7, [x0, #32] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: fcvtn2 v2.4s, v6.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: fcvtn2 v3.4s, v7.2d +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v16f64_v16f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.d +; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.d +; VBITS_EQ_256-NEXT: fcvt z2.s, p0/m, z2.d +; VBITS_EQ_256-NEXT: fcvt z3.s, p0/m, z3.d +; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.d }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -622,6 +1646,80 @@ } define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: fcvt_v32f64_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #192] +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; 
NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: ldp q5, q4, [x0, #128] +; NO_SVE-NEXT: fcvtn2 v3.4s, v2.2d +; NO_SVE-NEXT: fcvtn v5.2s, v5.2d +; NO_SVE-NEXT: ldp q7, q6, [x0, #64] +; NO_SVE-NEXT: fcvtn2 v5.4s, v4.2d +; NO_SVE-NEXT: fcvtn v7.2s, v7.2d +; NO_SVE-NEXT: ldp q17, q16, [x0, #160] +; NO_SVE-NEXT: fcvtn2 v7.4s, v6.2d +; NO_SVE-NEXT: fcvtn v17.2s, v17.2d +; NO_SVE-NEXT: ldp q19, q18, [x0, #96] +; NO_SVE-NEXT: fcvtn2 v17.4s, v16.2d +; NO_SVE-NEXT: fcvtn v19.2s, v19.2d +; NO_SVE-NEXT: ldp q21, q20, [x0] +; NO_SVE-NEXT: fcvtn2 v19.4s, v18.2d +; NO_SVE-NEXT: fcvtn v2.2s, v21.2d +; NO_SVE-NEXT: ldp q0, q22, [x0, #32] +; NO_SVE-NEXT: stp q7, q19, [x1, #32] +; NO_SVE-NEXT: fcvtn2 v2.4s, v20.2d +; NO_SVE-NEXT: stp q5, q17, [x1, #64] +; NO_SVE-NEXT: stp q1, q3, [x1, #96] +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v22.2d +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcvt_v32f64_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x11, #12 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x13, #20 +; VBITS_EQ_256-NEXT: mov x14, #16 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.d +; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.d +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.d }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: movprfx z0, z5 +; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z5.d +; VBITS_EQ_256-NEXT: movprfx z1, z4 +; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z4.d +; VBITS_EQ_256-NEXT: st1w { z0.d }, p0, [x1, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [x1, x13, lsl #2] +; VBITS_EQ_256-NEXT: movprfx z0, z6 +; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z6.d +; VBITS_EQ_256-NEXT: movprfx z1, z3 +; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z3.d +; VBITS_EQ_256-NEXT: st1w { z0.d }, p0, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: movprfx z0, z2 +; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z2.d +; VBITS_EQ_256-NEXT: movprfx z1, z7 +; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z7.d +; VBITS_EQ_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -144,39 +144,39 @@ ; NO_SVE-LABEL: select_v64f16: ; NO_SVE: // %bb.0: ; NO_SVE-NEXT: tst w2, #0x1 -; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ldr q0, [x0, #96] ; NO_SVE-NEXT: csetm w8, ne -; NO_SVE-NEXT: ldr q1, [x0] -; NO_SVE-NEXT: ldr q2, [x0, #48] -; NO_SVE-NEXT: ldr q3, [x0, #32] -; NO_SVE-NEXT: ldr q4, [x0, #80] 
+; NO_SVE-NEXT: ldr q1, [x0, #112] +; NO_SVE-NEXT: ldr q2, [x0, #64] +; NO_SVE-NEXT: ldr q3, [x0, #80] +; NO_SVE-NEXT: ldr q4, [x0, #32] ; NO_SVE-NEXT: dup v21.8h, w8 -; NO_SVE-NEXT: ldr q5, [x0, #64] -; NO_SVE-NEXT: ldr q6, [x0, #112] -; NO_SVE-NEXT: ldr q7, [x0, #96] -; NO_SVE-NEXT: ldr q16, [x1, #16] -; NO_SVE-NEXT: ldr q17, [x1] -; NO_SVE-NEXT: ldr q18, [x1, #48] -; NO_SVE-NEXT: ldr q19, [x1, #32] +; NO_SVE-NEXT: ldr q5, [x0, #48] +; NO_SVE-NEXT: ldr q6, [x0] +; NO_SVE-NEXT: ldr q7, [x0, #16] +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: ldr q18, [x1, #64] +; NO_SVE-NEXT: ldr q19, [x1, #80] ; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b -; NO_SVE-NEXT: ldr q20, [x1, #80] +; NO_SVE-NEXT: ldr q20, [x1, #32] ; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b -; NO_SVE-NEXT: ldr q16, [x1, #64] +; NO_SVE-NEXT: ldr q16, [x1, #48] ; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b -; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: ldr q17, [x1] ; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b -; NO_SVE-NEXT: ldr q18, [x1, #96] +; NO_SVE-NEXT: ldr q18, [x1, #16] ; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b -; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q0, q1, [x0, #96] ; NO_SVE-NEXT: mov v0.16b, v21.16b ; NO_SVE-NEXT: mov v1.16b, v21.16b -; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: stp q2, q3, [x0, #64] ; NO_SVE-NEXT: mov v2.16b, v21.16b ; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b ; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b ; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b -; NO_SVE-NEXT: stp q0, q4, [x0, #64] -; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: stp q4, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] ; NO_SVE-NEXT: ret ; ; VBITS_GE_1024-LABEL: select_v64f16: @@ -426,39 +426,39 @@ ; NO_SVE-LABEL: select_v32f32: ; NO_SVE: // %bb.0: ; NO_SVE-NEXT: tst w2, #0x1 -; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ldr q0, [x0, #96] ; NO_SVE-NEXT: csetm w8, ne -; NO_SVE-NEXT: ldr q1, [x0] -; NO_SVE-NEXT: ldr q2, [x0, #48] -; NO_SVE-NEXT: ldr q3, [x0, #32] -; NO_SVE-NEXT: ldr q4, [x0, #80] +; NO_SVE-NEXT: ldr q1, [x0, #112] +; NO_SVE-NEXT: ldr q2, [x0, #64] +; NO_SVE-NEXT: ldr q3, [x0, #80] +; NO_SVE-NEXT: ldr q4, [x0, #32] ; NO_SVE-NEXT: dup v21.4s, w8 -; NO_SVE-NEXT: ldr q5, [x0, #64] -; NO_SVE-NEXT: ldr q6, [x0, #112] -; NO_SVE-NEXT: ldr q7, [x0, #96] -; NO_SVE-NEXT: ldr q16, [x1, #16] -; NO_SVE-NEXT: ldr q17, [x1] -; NO_SVE-NEXT: ldr q18, [x1, #48] -; NO_SVE-NEXT: ldr q19, [x1, #32] +; NO_SVE-NEXT: ldr q5, [x0, #48] +; NO_SVE-NEXT: ldr q6, [x0] +; NO_SVE-NEXT: ldr q7, [x0, #16] +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: ldr q18, [x1, #64] +; NO_SVE-NEXT: ldr q19, [x1, #80] ; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b -; NO_SVE-NEXT: ldr q20, [x1, #80] +; NO_SVE-NEXT: ldr q20, [x1, #32] ; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b -; NO_SVE-NEXT: ldr q16, [x1, #64] +; NO_SVE-NEXT: ldr q16, [x1, #48] ; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b -; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: ldr q17, [x1] ; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b -; NO_SVE-NEXT: ldr q18, [x1, #96] +; NO_SVE-NEXT: ldr q18, [x1, #16] ; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b -; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q0, q1, [x0, #96] ; NO_SVE-NEXT: mov v0.16b, v21.16b ; NO_SVE-NEXT: mov v1.16b, v21.16b -; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: stp q2, q3, [x0, #64] ; NO_SVE-NEXT: mov v2.16b, v21.16b ; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b ; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b ; NO_SVE-NEXT: bsl 
v2.16b, v7.16b, v18.16b -; NO_SVE-NEXT: stp q0, q4, [x0, #64] -; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: stp q4, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] ; NO_SVE-NEXT: ret ; ; VBITS_GE_1024-LABEL: select_v32f32: @@ -708,39 +708,39 @@ ; NO_SVE-LABEL: select_v16f64: ; NO_SVE: // %bb.0: ; NO_SVE-NEXT: tst w2, #0x1 -; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ldr q0, [x0, #96] ; NO_SVE-NEXT: csetm x8, ne -; NO_SVE-NEXT: ldr q1, [x0] -; NO_SVE-NEXT: ldr q2, [x0, #48] -; NO_SVE-NEXT: ldr q3, [x0, #32] -; NO_SVE-NEXT: ldr q4, [x0, #80] +; NO_SVE-NEXT: ldr q1, [x0, #112] +; NO_SVE-NEXT: ldr q2, [x0, #64] +; NO_SVE-NEXT: ldr q3, [x0, #80] +; NO_SVE-NEXT: ldr q4, [x0, #32] ; NO_SVE-NEXT: dup v21.2d, x8 -; NO_SVE-NEXT: ldr q5, [x0, #64] -; NO_SVE-NEXT: ldr q6, [x0, #112] -; NO_SVE-NEXT: ldr q7, [x0, #96] -; NO_SVE-NEXT: ldr q16, [x1, #16] -; NO_SVE-NEXT: ldr q17, [x1] -; NO_SVE-NEXT: ldr q18, [x1, #48] -; NO_SVE-NEXT: ldr q19, [x1, #32] +; NO_SVE-NEXT: ldr q5, [x0, #48] +; NO_SVE-NEXT: ldr q6, [x0] +; NO_SVE-NEXT: ldr q7, [x0, #16] +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: ldr q18, [x1, #64] +; NO_SVE-NEXT: ldr q19, [x1, #80] ; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b -; NO_SVE-NEXT: ldr q20, [x1, #80] +; NO_SVE-NEXT: ldr q20, [x1, #32] ; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b -; NO_SVE-NEXT: ldr q16, [x1, #64] +; NO_SVE-NEXT: ldr q16, [x1, #48] ; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b -; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: ldr q17, [x1] ; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b -; NO_SVE-NEXT: ldr q18, [x1, #96] +; NO_SVE-NEXT: ldr q18, [x1, #16] ; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b -; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q0, q1, [x0, #96] ; NO_SVE-NEXT: mov v0.16b, v21.16b ; NO_SVE-NEXT: mov v1.16b, v21.16b -; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: stp q2, q3, [x0, #64] ; NO_SVE-NEXT: mov v2.16b, v21.16b ; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b ; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b ; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b -; NO_SVE-NEXT: stp q0, q4, [x0, #64] -; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: stp q4, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] ; NO_SVE-NEXT: ret ; ; VBITS_GE_1024-LABEL: select_v16f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 @@ -22,6 +23,13 @@ ; Don't use SVE for 64-bit vectors. define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v2.4h, v2.4h, #15 +; NO_SVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v2.4h, v2.4h, #15 @@ -34,6 +42,14 @@ ; Don't use SVE for 128-bit vectors. 
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: shl v2.8h, v2.8h, #15 +; NO_SVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v2.8h, v2.8b, #0 @@ -46,14 +62,25 @@ } define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: select_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: fcmeq v4.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v5.8h, v0.8h, v3.8h +; NO_SVE-NEXT: bif v1.16b, v2.16b, v4.16b +; NO_SVE-NEXT: bif v0.16b, v3.16b, v5.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, <16 x half>* %a %op2 = load <16 x half>, <16 x half>* %b @@ -64,15 +91,33 @@ } define void @select_v32f16(<32 x half>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: select_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x1, #32] +; NO_SVE-NEXT: fcmeq v16.8h, v3.8h, v5.8h +; NO_SVE-NEXT: ldp q7, q6, [x1] +; NO_SVE-NEXT: fcmeq v17.8h, v2.8h, v4.8h +; NO_SVE-NEXT: bif v3.16b, v5.16b, v16.16b +; NO_SVE-NEXT: fcmeq v18.8h, v1.8h, v7.8h +; NO_SVE-NEXT: bif v2.16b, v4.16b, v17.16b +; NO_SVE-NEXT: fcmeq v19.8h, v0.8h, v6.8h +; NO_SVE-NEXT: bif v1.16b, v7.16b, v18.16b +; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: bif v0.16b, v6.16b, v19.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v32f16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <32 x half>, <32 x half>* %a %op2 = load <32 x half>, <32 x half>* %b %mask = fcmp oeq <32 x half> %op1, %op2 @@ -82,14 +127,46 @@ } define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: select_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x0, #64] +; NO_SVE-NEXT: ldp q7, q6, [x0, #96] +; NO_SVE-NEXT: ldp q17, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v20.8h, v7.8h, v17.8h +; NO_SVE-NEXT: ldp q19, q18, [x1, #64] +; NO_SVE-NEXT: fcmeq v22.8h, v6.8h, v16.8h +; NO_SVE-NEXT: bif v7.16b, v17.16b, v20.16b +; NO_SVE-NEXT: fcmeq v24.8h, v5.8h, v19.8h 
+; NO_SVE-NEXT: bif v6.16b, v16.16b, v22.16b +; NO_SVE-NEXT: ldp q23, q21, [x1, #32] +; NO_SVE-NEXT: fcmeq v26.8h, v4.8h, v18.8h +; NO_SVE-NEXT: bif v5.16b, v19.16b, v24.16b +; NO_SVE-NEXT: fcmeq v28.8h, v3.8h, v23.8h +; NO_SVE-NEXT: bif v4.16b, v18.16b, v26.16b +; NO_SVE-NEXT: ldp q27, q25, [x1] +; NO_SVE-NEXT: fcmeq v29.8h, v2.8h, v21.8h +; NO_SVE-NEXT: stp q7, q6, [x0, #96] +; NO_SVE-NEXT: bif v3.16b, v23.16b, v28.16b +; NO_SVE-NEXT: stp q5, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v30.8h, v1.8h, v27.8h +; NO_SVE-NEXT: bif v2.16b, v21.16b, v29.16b +; NO_SVE-NEXT: fcmeq v31.8h, v0.8h, v25.8h +; NO_SVE-NEXT: bif v1.16b, v27.16b, v30.16b +; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: bif v0.16b, v25.16b, v31.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v64f16: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <64 x half>, <64 x half>* %a %op2 = load <64 x half>, <64 x half>* %b @@ -100,14 +177,76 @@ } define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 { +; NO_SVE-LABEL: select_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: ldp q6, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v20.8h, v0.8h, v6.8h +; NO_SVE-NEXT: ldp q21, q23, [x0, #96] +; NO_SVE-NEXT: fcmeq v26.8h, v1.8h, v7.8h +; NO_SVE-NEXT: bif v0.16b, v6.16b, v20.16b +; NO_SVE-NEXT: bif v1.16b, v7.16b, v26.16b +; NO_SVE-NEXT: ldp q25, q27, [x1, #96] +; NO_SVE-NEXT: fcmeq v6.8h, v21.8h, v25.8h +; NO_SVE-NEXT: ldp q2, q3, [x0, #128] +; NO_SVE-NEXT: fcmeq v20.8h, v23.8h, v27.8h +; NO_SVE-NEXT: bsl v6.16b, v21.16b, v25.16b +; NO_SVE-NEXT: bsl v20.16b, v23.16b, v27.16b +; NO_SVE-NEXT: ldp q18, q19, [x1, #128] +; NO_SVE-NEXT: fcmeq v7.8h, v2.8h, v18.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v24.8h, v3.8h, v19.8h +; NO_SVE-NEXT: bif v2.16b, v18.16b, v7.16b +; NO_SVE-NEXT: bif v3.16b, v19.16b, v24.16b +; NO_SVE-NEXT: ldp q16, q17, [x1, #160] +; NO_SVE-NEXT: fcmeq v25.8h, v4.8h, v16.8h +; NO_SVE-NEXT: ldp q26, q21, [x0, #224] +; NO_SVE-NEXT: fcmeq v22.8h, v5.8h, v17.8h +; NO_SVE-NEXT: bif v4.16b, v16.16b, v25.16b +; NO_SVE-NEXT: bif v5.16b, v17.16b, v22.16b +; NO_SVE-NEXT: ldp q23, q27, [x1, #224] +; NO_SVE-NEXT: fcmeq v25.8h, v26.8h, v23.8h +; NO_SVE-NEXT: ldp q7, q18, [x0, #192] +; NO_SVE-NEXT: fcmeq v19.8h, v21.8h, v27.8h +; NO_SVE-NEXT: bsl v19.16b, v21.16b, v27.16b +; NO_SVE-NEXT: ldp q24, q16, [x1, #192] +; NO_SVE-NEXT: mov v21.16b, v25.16b +; NO_SVE-NEXT: bsl v21.16b, v26.16b, v23.16b +; NO_SVE-NEXT: fcmeq v22.8h, v7.8h, v24.8h +; NO_SVE-NEXT: fcmeq v17.8h, v18.8h, v16.8h +; NO_SVE-NEXT: bif v7.16b, v24.16b, v22.16b +; NO_SVE-NEXT: bit v16.16b, v18.16b, v17.16b +; NO_SVE-NEXT: ldp q23, q17, [x1] +; NO_SVE-NEXT: ldp q18, q22, [x1, #32] +; NO_SVE-NEXT: stp q0, q1, [x0, #64] +; NO_SVE-NEXT: stp q6, q20, [x0, #96] +; NO_SVE-NEXT: stp q2, q3, [x0, #128] +; NO_SVE-NEXT: stp q4, q5, [x0, #160] +; NO_SVE-NEXT: stp q7, q16, [x0, #192] +; NO_SVE-NEXT: stp q21, 
q19, [x0, #224] +; NO_SVE-NEXT: ldp q19, q21, [x0, #32] +; NO_SVE-NEXT: fcmeq v4.8h, v19.8h, v18.8h +; NO_SVE-NEXT: ldp q7, q16, [x0] +; NO_SVE-NEXT: fcmeq v5.8h, v21.8h, v22.8h +; NO_SVE-NEXT: bsl v4.16b, v19.16b, v18.16b +; NO_SVE-NEXT: fcmeq v2.8h, v7.8h, v23.8h +; NO_SVE-NEXT: bsl v5.16b, v21.16b, v22.16b +; NO_SVE-NEXT: fcmeq v3.8h, v16.8h, v17.8h +; NO_SVE-NEXT: mov v1.16b, v2.16b +; NO_SVE-NEXT: bsl v1.16b, v7.16b, v23.16b +; NO_SVE-NEXT: stp q4, q5, [x0, #32] +; NO_SVE-NEXT: bsl v3.16b, v16.16b, v17.16b +; NO_SVE-NEXT: stp q1, q3, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v128f16: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <128 x half>, <128 x half>* %a %op2 = load <128 x half>, <128 x half>* %b @@ -119,6 +258,13 @@ ; Don't use SVE for 64-bit vectors. define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v2.2s, v2.2s, #31 +; NO_SVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v2.2s, v2.2s, #31 @@ -131,6 +277,14 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: shl v2.4s, v2.4s, #31 +; NO_SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 @@ -143,14 +297,25 @@ } define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: select_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: fcmeq v4.4s, v1.4s, v2.4s +; NO_SVE-NEXT: fcmeq v5.4s, v0.4s, v3.4s +; NO_SVE-NEXT: bif v1.16b, v2.16b, v4.16b +; NO_SVE-NEXT: bif v0.16b, v3.16b, v5.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, <8 x float>* %a %op2 = load <8 x float>, <8 x float>* %b @@ -161,15 +326,33 @@ } define void @select_v16f32(<16 x float>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: select_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x1, #32] +; NO_SVE-NEXT: fcmeq v16.4s, v3.4s, v5.4s +; NO_SVE-NEXT: ldp q7, q6, [x1] +; NO_SVE-NEXT: fcmeq v17.4s, v2.4s, v4.4s +; NO_SVE-NEXT: bif v3.16b, v5.16b, v16.16b +; NO_SVE-NEXT: fcmeq v18.4s, v1.4s, v7.4s +; NO_SVE-NEXT: bif v2.16b, v4.16b, v17.16b +; NO_SVE-NEXT: fcmeq v19.4s, v0.4s, v6.4s +; NO_SVE-NEXT: bif v1.16b, v7.16b, v18.16b +; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: bif v0.16b, v6.16b, v19.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512: ptrue p0.s, vl16 -; VBITS_GE_512: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_512: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_512: st1w { z0.s }, p0, [x0] -; VBITS_GE_512: ret +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a %op2 = load <16 x float>, <16 x float>* %b %mask = fcmp oeq <16 x float> %op1, %op2 @@ -179,15 +362,47 @@ } define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: select_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x0, #64] +; NO_SVE-NEXT: ldp q7, q6, [x0, #96] +; NO_SVE-NEXT: ldp q17, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v20.4s, v7.4s, v17.4s +; NO_SVE-NEXT: ldp q19, q18, [x1, #64] +; NO_SVE-NEXT: fcmeq v22.4s, v6.4s, v16.4s +; NO_SVE-NEXT: bif v7.16b, v17.16b, v20.16b +; NO_SVE-NEXT: fcmeq v24.4s, v5.4s, v19.4s +; NO_SVE-NEXT: bif v6.16b, v16.16b, v22.16b +; 
NO_SVE-NEXT: ldp q23, q21, [x1, #32] +; NO_SVE-NEXT: fcmeq v26.4s, v4.4s, v18.4s +; NO_SVE-NEXT: bif v5.16b, v19.16b, v24.16b +; NO_SVE-NEXT: fcmeq v28.4s, v3.4s, v23.4s +; NO_SVE-NEXT: bif v4.16b, v18.16b, v26.16b +; NO_SVE-NEXT: ldp q27, q25, [x1] +; NO_SVE-NEXT: fcmeq v29.4s, v2.4s, v21.4s +; NO_SVE-NEXT: stp q7, q6, [x0, #96] +; NO_SVE-NEXT: bif v3.16b, v23.16b, v28.16b +; NO_SVE-NEXT: stp q5, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v30.4s, v1.4s, v27.4s +; NO_SVE-NEXT: bif v2.16b, v21.16b, v29.16b +; NO_SVE-NEXT: fcmeq v31.4s, v0.4s, v25.4s +; NO_SVE-NEXT: bif v1.16b, v27.16b, v30.16b +; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: bif v0.16b, v25.16b, v31.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v32f32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <32 x float>, <32 x float>* %a %op2 = load <32 x float>, <32 x float>* %b %mask = fcmp oeq <32 x float> %op1, %op2 @@ -197,14 +412,76 @@ } define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 { +; NO_SVE-LABEL: select_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: ldp q6, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v20.4s, v0.4s, v6.4s +; NO_SVE-NEXT: ldp q21, q23, [x0, #96] +; NO_SVE-NEXT: fcmeq v26.4s, v1.4s, v7.4s +; NO_SVE-NEXT: bif v0.16b, v6.16b, v20.16b +; NO_SVE-NEXT: bif v1.16b, v7.16b, v26.16b +; NO_SVE-NEXT: ldp q25, q27, [x1, #96] +; NO_SVE-NEXT: fcmeq v6.4s, v21.4s, v25.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #128] +; NO_SVE-NEXT: fcmeq v20.4s, v23.4s, v27.4s +; NO_SVE-NEXT: bsl v6.16b, v21.16b, v25.16b +; NO_SVE-NEXT: bsl v20.16b, v23.16b, v27.16b +; NO_SVE-NEXT: ldp q18, q19, [x1, #128] +; NO_SVE-NEXT: fcmeq v7.4s, v2.4s, v18.4s +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v24.4s, v3.4s, v19.4s +; NO_SVE-NEXT: bif v2.16b, v18.16b, v7.16b +; NO_SVE-NEXT: bif v3.16b, v19.16b, v24.16b +; NO_SVE-NEXT: ldp q16, q17, [x1, #160] +; NO_SVE-NEXT: fcmeq v25.4s, v4.4s, v16.4s +; NO_SVE-NEXT: ldp q26, q21, [x0, #224] +; NO_SVE-NEXT: fcmeq v22.4s, v5.4s, v17.4s +; NO_SVE-NEXT: bif v4.16b, v16.16b, v25.16b +; NO_SVE-NEXT: bif v5.16b, v17.16b, v22.16b +; NO_SVE-NEXT: ldp q23, q27, [x1, #224] +; NO_SVE-NEXT: fcmeq v25.4s, v26.4s, v23.4s +; NO_SVE-NEXT: ldp q7, q18, [x0, #192] +; NO_SVE-NEXT: fcmeq v19.4s, v21.4s, v27.4s +; NO_SVE-NEXT: bsl v19.16b, v21.16b, v27.16b +; NO_SVE-NEXT: ldp q24, q16, [x1, #192] +; NO_SVE-NEXT: mov v21.16b, v25.16b +; NO_SVE-NEXT: bsl v21.16b, v26.16b, v23.16b +; NO_SVE-NEXT: fcmeq v22.4s, v7.4s, v24.4s +; NO_SVE-NEXT: fcmeq v17.4s, v18.4s, v16.4s +; NO_SVE-NEXT: bif v7.16b, v24.16b, v22.16b +; NO_SVE-NEXT: bit v16.16b, v18.16b, v17.16b +; NO_SVE-NEXT: ldp q23, q17, [x1] +; NO_SVE-NEXT: ldp q18, q22, [x1, #32] +; NO_SVE-NEXT: stp q0, q1, [x0, #64] +; NO_SVE-NEXT: stp q6, q20, [x0, #96] +; NO_SVE-NEXT: stp q2, q3, [x0, #128] +; NO_SVE-NEXT: stp q4, q5, [x0, #160] +; NO_SVE-NEXT: stp q7, q16, [x0, #192] +; 
NO_SVE-NEXT: stp q21, q19, [x0, #224] +; NO_SVE-NEXT: ldp q19, q21, [x0, #32] +; NO_SVE-NEXT: fcmeq v4.4s, v19.4s, v18.4s +; NO_SVE-NEXT: ldp q7, q16, [x0] +; NO_SVE-NEXT: fcmeq v5.4s, v21.4s, v22.4s +; NO_SVE-NEXT: bsl v4.16b, v19.16b, v18.16b +; NO_SVE-NEXT: fcmeq v2.4s, v7.4s, v23.4s +; NO_SVE-NEXT: bsl v5.16b, v21.16b, v22.16b +; NO_SVE-NEXT: fcmeq v3.4s, v16.4s, v17.4s +; NO_SVE-NEXT: mov v1.16b, v2.16b +; NO_SVE-NEXT: bsl v1.16b, v7.16b, v23.16b +; NO_SVE-NEXT: stp q4, q5, [x0, #32] +; NO_SVE-NEXT: bsl v3.16b, v16.16b, v17.16b +; NO_SVE-NEXT: stp q1, q3, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v64f32: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <64 x float>, <64 x float>* %a %op2 = load <64 x float>, <64 x float>* %b @@ -216,6 +493,14 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: tst w0, #0x1 +; NO_SVE-NEXT: csetm x8, ne +; NO_SVE-NEXT: fmov d2, x8 +; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 @@ -229,6 +514,14 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: shl v2.2d, v2.2d, #63 +; NO_SVE-NEXT: cmlt v2.2d, v2.2d, #0 +; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v2.2d, v2.2s, #0 @@ -241,14 +534,25 @@ } define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: select_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: fcmeq v4.2d, v1.2d, v2.2d +; NO_SVE-NEXT: fcmeq v5.2d, v0.2d, v3.2d +; NO_SVE-NEXT: bif v1.16b, v2.16b, v4.16b +; NO_SVE-NEXT: bif v0.16b, v3.16b, v5.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a %op2 = load <4 x double>, <4 x double>* %b @@ -259,14 +563,32 @@ } define void @select_v8f64(<8 x double>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: select_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x1, #32] +; NO_SVE-NEXT: fcmeq v16.2d, v3.2d, v5.2d +; NO_SVE-NEXT: ldp q7, q6, [x1] +; NO_SVE-NEXT: fcmeq v17.2d, v2.2d, v4.2d +; NO_SVE-NEXT: bif v3.16b, v5.16b, v16.16b +; NO_SVE-NEXT: fcmeq v18.2d, v1.2d, v7.2d +; NO_SVE-NEXT: bif v2.16b, v4.16b, v17.16b +; NO_SVE-NEXT: fcmeq v19.2d, v0.2d, v6.2d +; NO_SVE-NEXT: bif v1.16b, v7.16b, v18.16b +; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: bif v0.16b, v6.16b, v19.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %op2 = load <8 x double>, <8 x double>* %b @@ -277,14 +599,46 @@ } define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: select_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x0, #64] +; NO_SVE-NEXT: ldp q7, q6, [x0, #96] +; NO_SVE-NEXT: ldp q17, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v20.2d, v7.2d, v17.2d +; NO_SVE-NEXT: ldp q19, q18, [x1, #64] +; NO_SVE-NEXT: fcmeq v22.2d, v6.2d, v16.2d +; NO_SVE-NEXT: bif v7.16b, v17.16b, v20.16b +; NO_SVE-NEXT: fcmeq v24.2d, v5.2d, v19.2d +; NO_SVE-NEXT: bif v6.16b, v16.16b, v22.16b +; NO_SVE-NEXT: ldp q23, q21, 
[x1, #32] +; NO_SVE-NEXT: fcmeq v26.2d, v4.2d, v18.2d +; NO_SVE-NEXT: bif v5.16b, v19.16b, v24.16b +; NO_SVE-NEXT: fcmeq v28.2d, v3.2d, v23.2d +; NO_SVE-NEXT: bif v4.16b, v18.16b, v26.16b +; NO_SVE-NEXT: ldp q27, q25, [x1] +; NO_SVE-NEXT: fcmeq v29.2d, v2.2d, v21.2d +; NO_SVE-NEXT: stp q7, q6, [x0, #96] +; NO_SVE-NEXT: bif v3.16b, v23.16b, v28.16b +; NO_SVE-NEXT: stp q5, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v30.2d, v1.2d, v27.2d +; NO_SVE-NEXT: bif v2.16b, v21.16b, v29.16b +; NO_SVE-NEXT: fcmeq v31.2d, v0.2d, v25.2d +; NO_SVE-NEXT: bif v1.16b, v27.16b, v30.16b +; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: bif v0.16b, v25.16b, v31.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v16f64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %op2 = load <16 x double>, <16 x double>* %b @@ -295,14 +649,76 @@ } define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: select_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: ldp q6, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v20.2d, v0.2d, v6.2d +; NO_SVE-NEXT: ldp q21, q23, [x0, #96] +; NO_SVE-NEXT: fcmeq v26.2d, v1.2d, v7.2d +; NO_SVE-NEXT: bif v0.16b, v6.16b, v20.16b +; NO_SVE-NEXT: bif v1.16b, v7.16b, v26.16b +; NO_SVE-NEXT: ldp q25, q27, [x1, #96] +; NO_SVE-NEXT: fcmeq v6.2d, v21.2d, v25.2d +; NO_SVE-NEXT: ldp q2, q3, [x0, #128] +; NO_SVE-NEXT: fcmeq v20.2d, v23.2d, v27.2d +; NO_SVE-NEXT: bsl v6.16b, v21.16b, v25.16b +; NO_SVE-NEXT: bsl v20.16b, v23.16b, v27.16b +; NO_SVE-NEXT: ldp q18, q19, [x1, #128] +; NO_SVE-NEXT: fcmeq v7.2d, v2.2d, v18.2d +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v24.2d, v3.2d, v19.2d +; NO_SVE-NEXT: bif v2.16b, v18.16b, v7.16b +; NO_SVE-NEXT: bif v3.16b, v19.16b, v24.16b +; NO_SVE-NEXT: ldp q16, q17, [x1, #160] +; NO_SVE-NEXT: fcmeq v25.2d, v4.2d, v16.2d +; NO_SVE-NEXT: ldp q26, q21, [x0, #224] +; NO_SVE-NEXT: fcmeq v22.2d, v5.2d, v17.2d +; NO_SVE-NEXT: bif v4.16b, v16.16b, v25.16b +; NO_SVE-NEXT: bif v5.16b, v17.16b, v22.16b +; NO_SVE-NEXT: ldp q23, q27, [x1, #224] +; NO_SVE-NEXT: fcmeq v25.2d, v26.2d, v23.2d +; NO_SVE-NEXT: ldp q7, q18, [x0, #192] +; NO_SVE-NEXT: fcmeq v19.2d, v21.2d, v27.2d +; NO_SVE-NEXT: bsl v19.16b, v21.16b, v27.16b +; NO_SVE-NEXT: ldp q24, q16, [x1, #192] +; NO_SVE-NEXT: mov v21.16b, v25.16b +; NO_SVE-NEXT: bsl v21.16b, v26.16b, v23.16b +; NO_SVE-NEXT: fcmeq v22.2d, v7.2d, v24.2d +; NO_SVE-NEXT: fcmeq v17.2d, v18.2d, v16.2d +; NO_SVE-NEXT: bif v7.16b, v24.16b, v22.16b +; NO_SVE-NEXT: bit v16.16b, v18.16b, v17.16b +; NO_SVE-NEXT: ldp q23, q17, [x1] +; NO_SVE-NEXT: ldp q18, q22, [x1, #32] +; NO_SVE-NEXT: stp q0, q1, [x0, #64] +; NO_SVE-NEXT: stp q6, q20, [x0, #96] +; NO_SVE-NEXT: stp q2, q3, [x0, #128] +; NO_SVE-NEXT: stp q4, q5, [x0, #160] +; NO_SVE-NEXT: stp q7, q16, [x0, #192] +; NO_SVE-NEXT: stp q21, q19, [x0, #224] +; NO_SVE-NEXT: ldp q19, q21, [x0, #32] +; 
NO_SVE-NEXT: fcmeq v4.2d, v19.2d, v18.2d +; NO_SVE-NEXT: ldp q7, q16, [x0] +; NO_SVE-NEXT: fcmeq v5.2d, v21.2d, v22.2d +; NO_SVE-NEXT: bsl v4.16b, v19.16b, v18.16b +; NO_SVE-NEXT: fcmeq v2.2d, v7.2d, v23.2d +; NO_SVE-NEXT: bsl v5.16b, v21.16b, v22.16b +; NO_SVE-NEXT: fcmeq v3.2d, v16.2d, v17.2d +; NO_SVE-NEXT: mov v1.16b, v2.16b +; NO_SVE-NEXT: bsl v1.16b, v7.16b, v23.16b +; NO_SVE-NEXT: stp q4, q5, [x0, #32] +; NO_SVE-NEXT: bsl v3.16b, v16.16b, v17.16b +; NO_SVE-NEXT: stp q1, q3, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v32f64: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <32 x double>, <32 x double>* %a %op2 = load <32 x double>, <32 x double>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -8,13 +9,66 @@ ; spill slot. define dso_local void @func1(i64* %v1, i64* %v2, i64* %v3, i64* %v4, i64* %v5, i64* %v6, i64* %v7, i64* %v8, +; CHECK-LABEL: func1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x25, [sp, #-64]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -64 +; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: add x9, sp, #128 +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: add x11, sp, #192 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x20, sp, #192 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x11] +; CHECK-NEXT: ldp x18, x19, [sp, #368] +; CHECK-NEXT: add x21, sp, #160 +; CHECK-NEXT: add x22, sp, #128 +; CHECK-NEXT: ldp x24, x14, [sp, #296] +; CHECK-NEXT: add x23, sp, #64 +; CHECK-NEXT: ldr x25, [sp, #288] +; CHECK-NEXT: ldp x9, x8, [sp, #344] +; CHECK-NEXT: ldp x11, x10, [sp, #328] +; CHECK-NEXT: ldp x13, x12, [sp, #312] +; CHECK-NEXT: ldr x15, [sp, #120] +; CHECK-NEXT: ldur q4, [sp, #104] +; CHECK-NEXT: ldp x16, x17, [sp, #224] +; CHECK-NEXT: st1d { z3.d }, p0, [x20] +; CHECK-NEXT: st1d { z2.d }, p0, [x21] +; CHECK-NEXT: st1d { z1.d }, p0, [x22] +; CHECK-NEXT: st1d { z0.d }, p0, [x23] +; CHECK-NEXT: stp x18, x19, [sp, #368] +; CHECK-NEXT: stp x25, x24, [sp, #288] +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: stp x16, x17, [sp, #224] +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stur q4, [sp, #104] +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: str x15, [sp, #120] +; CHECK-NEXT: stp x14, x13, [sp, #304] +; CHECK-NEXT: stp x12, x11, [sp, #320] +; CHECK-NEXT: stp x10, x9, [sp, #336] +; CHECK-NEXT: str x8, [sp, #352] +; CHECK-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: b func2 i64* %v9, i64* %v10, i64* %v11, i64* %v12, i64* %v13, i64* %v14, i64* %v15, i64* %v16, i64* %v17, i64* %v18, i64* %v19, i64* %v20, i64* %v21, i64* %v22, i64* %v23, i64* %v24, i64* %v25, i64* %v26, i64* %v27, i64* %v28, i64* %v29, i64* %v30, i64* %v31, i64* %v32, i64* %v33, i64* %v34, i64* %v35, i64* %v36, i64* %v37, i64* %v38, i64* %v39, i64* %v40, i64* %v41, i64* %v42, i64* %v43, i64* %v44, i64* %v45, i64* %v46, i64* %v47, i64* %v48, i64 %v49) #0 { -; CHECK-LABEL: func1 tail call void @func2(i64* %v1, i64* %v2, i64* %v3, i64* %v4, i64* %v5, i64* %v6, i64* %v7, i64* %v8, i64* %v9, i64* %v10, i64* %v11, i64* %v12, i64* undef, i64* %v14, i64* %v15, i64* %v16, i64* %v17, i64* %v18, i64* %v19, i64* %v20, i64* %v21, i64* %v22, i64* %v23, i64* %v24, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 @@ -26,6 +27,14 @@ ; Don't use SVE for 64-bit vectors. 
define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 { +; NO_SVE-LABEL: insertelement_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h1, #5.00000000 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: mov v0.h[3], v1.h[0] +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v4f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov h1, #5.00000000 @@ -39,6 +48,12 @@ ; Don't use SVE for 128-bit vectors. define <8 x half> @insertelement_v8f16(<8 x half> %op1) #0 { +; NO_SVE-LABEL: insertelement_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h1, #5.00000000 +; NO_SVE-NEXT: mov v0.h[7], v1.h[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v8f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov h1, #5.00000000 @@ -49,6 +64,14 @@ } define <16 x half> @insertelement_v16f16(<16 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h0, #5.00000000 +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov v1.h[7], v0.h[0] +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v16f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov w9, #15 @@ -68,6 +91,15 @@ } define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h2, #5.00000000 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v3.h[7], v2.h[0] +; NO_SVE-NEXT: ldr q2, [x0, #32] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov w9, #31 @@ -87,6 +119,16 @@ } define <64 x half> @insertelement_v64f16(<64 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: fmov h5, #5.00000000 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v7.h[7], v5.h[0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: insertelement_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: mov w9, #63 @@ -106,6 +148,28 @@ } define <128 x half> @insertelement_v128f16(<128 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q23, q24, [x0, #224] +; NO_SVE-NEXT: fmov h0, #5.00000000 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: mov v24.h[7], v0.h[0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: ldp q5, q6, [x0, #64] +; NO_SVE-NEXT: ldp q7, q16, [x0, #96] +; NO_SVE-NEXT: ldp q17, q18, [x0, #128] +; NO_SVE-NEXT: ldp q19, q20, [x0, #192] +; NO_SVE-NEXT: ldp q21, q22, [x0, #160] +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: stp q3, q4, [x8, #32] +; NO_SVE-NEXT: stp q5, q6, [x8, #64] +; NO_SVE-NEXT: stp q7, q16, [x8, #96] +; NO_SVE-NEXT: stp q17, q18, [x8, #128] +; NO_SVE-NEXT: stp q21, q22, [x8, #160] +; NO_SVE-NEXT: stp q19, q20, [x8, #192] +; NO_SVE-NEXT: stp q23, q24, [x8, #224] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: insertelement_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: mov w9, #127 @@ -126,6 +190,14 @@ ; Don't use SVE for 64-bit vectors. 
define <2 x float> @insertelement_v2f32(<2 x float> %op1) #0 { +; NO_SVE-LABEL: insertelement_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov s1, #5.00000000 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: mov v0.s[1], v1.s[0] +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v2f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov s1, #5.00000000 @@ -139,6 +211,12 @@ ; Don't use SVE for 128-bit vectors. define <4 x float> @insertelement_v4f32(<4 x float> %op1) #0 { +; NO_SVE-LABEL: insertelement_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov s1, #5.00000000 +; NO_SVE-NEXT: mov v0.s[3], v1.s[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v4f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov s1, #5.00000000 @@ -149,6 +227,13 @@ } define <8 x float> @insertelement_v8f32(<8 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fmov s2, #5.00000000 +; NO_SVE-NEXT: mov v1.s[3], v2.s[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v8f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov w9, #7 @@ -168,6 +253,14 @@ } define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: fmov s1, #5.00000000 +; NO_SVE-NEXT: mov v3.s[3], v1.s[0] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov w9, #15 @@ -187,6 +280,16 @@ } define <32 x float> @insertelement_v32f32(<32 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: fmov s5, #5.00000000 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v7.s[3], v5.s[0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: insertelement_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: mov w9, #31 @@ -206,6 +309,28 @@ } define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q23, q24, [x0, #224] +; NO_SVE-NEXT: fmov s0, #5.00000000 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: mov v24.s[3], v0.s[0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: ldp q5, q6, [x0, #64] +; NO_SVE-NEXT: ldp q7, q16, [x0, #96] +; NO_SVE-NEXT: ldp q17, q18, [x0, #128] +; NO_SVE-NEXT: ldp q19, q20, [x0, #192] +; NO_SVE-NEXT: ldp q21, q22, [x0, #160] +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: stp q3, q4, [x8, #32] +; NO_SVE-NEXT: stp q5, q6, [x8, #64] +; NO_SVE-NEXT: stp q7, q16, [x8, #96] +; NO_SVE-NEXT: stp q17, q18, [x8, #128] +; NO_SVE-NEXT: stp q21, q22, [x8, #160] +; NO_SVE-NEXT: stp q19, q20, [x8, #192] +; NO_SVE-NEXT: stp q23, q24, [x8, #224] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: insertelement_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: mov w9, #63 @@ -226,6 +351,11 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 { +; NO_SVE-LABEL: insertelement_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d0, #5.00000000 +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v1f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov d0, #5.00000000 @@ -236,6 +366,12 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x double> @insertelement_v2f64(<2 x double> %op1) #0 { +; NO_SVE-LABEL: insertelement_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d1, #5.00000000 +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v2f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov d1, #5.00000000 @@ -246,6 +382,14 @@ } define <4 x double> @insertelement_v4f64(<4 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d0, #5.00000000 +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov v1.d[1], v0.d[0] +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v4f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov w9, #3 @@ -265,6 +409,15 @@ } define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d2, #5.00000000 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v3.d[1], v2.d[0] +; NO_SVE-NEXT: ldr q2, [x0, #32] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov w9, #7 @@ -284,6 +437,16 @@ } define <16 x double> @insertelement_v16f64(<16 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: fmov d5, #5.00000000 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v7.d[1], v5.d[0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: insertelement_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: mov w9, #15 @@ -303,6 +466,28 @@ } define <32 x double> @insertelement_v32f64(<32 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q23, q24, [x0, #224] +; NO_SVE-NEXT: fmov d0, #5.00000000 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: mov v24.d[1], v0.d[0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: ldp q5, q6, [x0, #64] +; NO_SVE-NEXT: ldp q7, q16, [x0, #96] +; NO_SVE-NEXT: ldp q17, q18, [x0, #128] +; NO_SVE-NEXT: ldp q19, q20, [x0, #192] +; NO_SVE-NEXT: ldp q21, q22, [x0, #160] +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: stp q3, q4, [x8, #32] +; NO_SVE-NEXT: stp q5, q6, [x8, #64] +; NO_SVE-NEXT: stp q7, q16, [x8, #96] +; NO_SVE-NEXT: stp q17, q18, [x8, #128] +; NO_SVE-NEXT: stp q21, q22, [x8, #160] +; NO_SVE-NEXT: stp q19, q20, [x8, #192] +; NO_SVE-NEXT: stp q23, q24, [x8, #224] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: insertelement_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: mov w9, #31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK @@ -32,6 +33,12 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. 
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v8i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v1.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b @@ -66,6 +73,13 @@ ; Don't use SVE for 128-bit vectors. define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v16i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: smull2 v2.8h, v0.16b, v1.16b +; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v1.8b +; VBITS_EQ_128-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b @@ -89,6 +103,29 @@ ; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v32i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: smull v4.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: smull2 v0.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v5.8h, v1.8b, v3.8b +; VBITS_EQ_128-NEXT: smull2 v1.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v2.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn v3.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v1.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; +; CHECK-LABEL: smulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b %1 = sext <32 x i8> %op1 to <32 x i16> @@ -101,6 +138,32 @@ } define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v64i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: smull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: smull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v16.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v3.8h, v4.16b, v5.16b +; VBITS_EQ_128-NEXT: smull v4.8h, v4.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: smulh_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 @@ -123,6 +186,54 @@ } define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v128i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: smull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64] 
+; VBITS_EQ_128-NEXT: smull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: smull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v17.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v19.8h, v4.16b, v16.16b +; VBITS_EQ_128-NEXT: smull v4.8h, v4.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v19.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v21.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: smull v3.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: smull2 v23.8h, v18.16b, v20.16b +; VBITS_EQ_128-NEXT: smull v18.8h, v18.8b, v20.8b +; VBITS_EQ_128-NEXT: shrn v3.8b, v3.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.16b, v23.8h, #8 +; VBITS_EQ_128-NEXT: smull v20.8h, v16.8b, v5.8b +; VBITS_EQ_128-NEXT: smull2 v5.8h, v16.16b, v5.16b +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull v25.8h, v22.8b, v24.8b +; VBITS_EQ_128-NEXT: smull2 v16.8h, v22.16b, v24.16b +; VBITS_EQ_128-NEXT: shrn v20.8b, v20.8h, #8 +; VBITS_EQ_128-NEXT: shrn v22.8b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v20.16b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v22.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -144,6 +255,121 @@ } define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v256i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 v0.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: smull v4.8h, v1.8b, v3.8b +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: smull2 v0.8h, v2.16b, v6.16b +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: smull v6.8h, v2.8b, v6.8b +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: smull2 v2.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v6.8b, v6.8h, #8 +; VBITS_EQ_128-NEXT: smull v5.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, 
#160] +; VBITS_EQ_128-NEXT: smull2 v3.8h, v7.16b, v16.16b +; VBITS_EQ_128-NEXT: smull v7.8h, v7.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v5.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v5.16b, v2.8h, #8 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.8b, v7.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v7.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v31.8h, v19.16b, v16.16b +; VBITS_EQ_128-NEXT: smull v9.8h, v19.8b, v16.8b +; VBITS_EQ_128-NEXT: smull2 v21.8h, v18.16b, v17.16b +; VBITS_EQ_128-NEXT: smull v30.8h, v18.8b, v17.8b +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.8b, v9.8h, #8 +; VBITS_EQ_128-NEXT: shrn v30.8b, v30.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v9.16b, v31.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v30.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: smull2 v16.8h, v17.16b, v20.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v18.8h, v17.8b, v20.8b +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: smull2 v17.8h, v22.16b, v19.16b +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: smull v19.8h, v22.8b, v19.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.8b, v19.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v12.8h, v24.16b, v22.16b +; VBITS_EQ_128-NEXT: smull v13.8h, v24.8b, v22.8b +; VBITS_EQ_128-NEXT: smull2 v10.8h, v20.16b, v23.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v11.8h, v20.8b, v23.8b +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: smull2 v22.8h, v26.16b, v24.16b +; VBITS_EQ_128-NEXT: smull v24.8h, v26.8b, v24.8b +; VBITS_EQ_128-NEXT: smull2 v20.8h, v23.16b, v25.16b +; VBITS_EQ_128-NEXT: smull v23.8h, v23.8b, v25.8b +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v15.8h, v28.16b, v26.16b +; VBITS_EQ_128-NEXT: smull v1.8h, v28.8b, v26.8b +; VBITS_EQ_128-NEXT: smull2 v14.8h, v25.16b, v27.16b +; VBITS_EQ_128-NEXT: smull v8.8h, v25.8b, v27.8b +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.8b, v8.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v8.16b, v14.8h, #8 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.8b, v23.8h, #8 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.8b, v24.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v20.8h, #8 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 v26.8h, v0.16b, v28.16b +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v22.8h, #8 +; VBITS_EQ_128-NEXT: smull v28.8h, v0.8b, v28.8b +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: smull2 v25.8h, v27.16b, v29.16b +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: smull v27.8h, v27.8b, v29.8b +; VBITS_EQ_128-NEXT: shrn v29.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn v0.8b, v13.8h, #8 +; VBITS_EQ_128-NEXT: shrn v1.8b, v11.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v29.16b, v15.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v12.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v10.8h, #8 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.8b, v27.8h, #8 +; 
VBITS_EQ_128-NEXT: shrn v1.8b, v28.8h, #8 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v26.8h, #8 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -166,6 +392,12 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v4i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v1.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h @@ -189,6 +421,13 @@ ; Don't use SVE for 128-bit vectors. define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v8i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: smull2 v2.4s, v0.8h, v1.8h +; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v1.4h +; VBITS_EQ_128-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h @@ -212,6 +451,29 @@ ; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v16i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: smull v4.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: smull2 v0.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v5.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: smull2 v1.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v2.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn v3.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v1.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; +; CHECK-LABEL: smulh_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b %1 = sext <16 x i16> %op1 to <16 x i32> @@ -224,6 +486,32 @@ } define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v32i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: smull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: smull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v16.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v3.4s, v4.8h, v5.8h +; VBITS_EQ_128-NEXT: 
smull v4.4s, v4.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: smulh_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -244,6 +532,54 @@ } define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v64i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: smull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64] +; VBITS_EQ_128-NEXT: smull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: smull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v17.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v19.4s, v4.8h, v16.8h +; VBITS_EQ_128-NEXT: smull v4.4s, v4.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v19.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v21.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: smull v3.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: smull2 v23.4s, v18.8h, v20.8h +; VBITS_EQ_128-NEXT: smull v18.4s, v18.4h, v20.4h +; VBITS_EQ_128-NEXT: shrn v3.4h, v3.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.8h, v23.4s, #16 +; VBITS_EQ_128-NEXT: smull v20.4s, v16.4h, v5.4h +; VBITS_EQ_128-NEXT: smull2 v5.4s, v16.8h, v5.8h +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull v25.4s, v22.4h, v24.4h +; VBITS_EQ_128-NEXT: smull2 v16.4s, v22.8h, v24.8h +; VBITS_EQ_128-NEXT: shrn v20.4h, v20.4s, #16 +; VBITS_EQ_128-NEXT: shrn v22.4h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v20.8h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v22.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -264,6 +600,121 @@ } define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v128i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; VBITS_EQ_128-NEXT: 
.cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 v0.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: smull v4.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: smull2 v0.4s, v2.8h, v6.8h +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: smull v6.4s, v2.4h, v6.4h +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: smull2 v2.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v6.4h, v6.4s, #16 +; VBITS_EQ_128-NEXT: smull v5.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160] +; VBITS_EQ_128-NEXT: smull2 v3.4s, v7.8h, v16.8h +; VBITS_EQ_128-NEXT: smull v7.4s, v7.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v5.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v5.8h, v2.4s, #16 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.4h, v7.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v7.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v31.4s, v19.8h, v16.8h +; VBITS_EQ_128-NEXT: smull v9.4s, v19.4h, v16.4h +; VBITS_EQ_128-NEXT: smull2 v21.4s, v18.8h, v17.8h +; VBITS_EQ_128-NEXT: smull v30.4s, v18.4h, v17.4h +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.4h, v9.4s, #16 +; VBITS_EQ_128-NEXT: shrn v30.4h, v30.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v9.8h, v31.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v30.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: smull2 v16.4s, v17.8h, v20.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v18.4s, v17.4h, v20.4h +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: smull2 v17.4s, v22.8h, v19.8h +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: smull v19.4s, v22.4h, v19.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.4h, v19.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v12.4s, v24.8h, v22.8h +; VBITS_EQ_128-NEXT: smull v13.4s, v24.4h, v22.4h +; VBITS_EQ_128-NEXT: smull2 v10.4s, v20.8h, v23.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v11.4s, v20.4h, v23.4h +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: smull2 v22.4s, v26.8h, v24.8h +; VBITS_EQ_128-NEXT: smull v24.4s, v26.4h, v24.4h +; VBITS_EQ_128-NEXT: smull2 v20.4s, v23.8h, v25.8h +; VBITS_EQ_128-NEXT: smull v23.4s, v23.4h, v25.4h +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v15.4s, v28.8h, v26.8h +; VBITS_EQ_128-NEXT: smull v1.4s, v28.4h, v26.4h +; VBITS_EQ_128-NEXT: smull2 v14.4s, v25.8h, v27.8h +; VBITS_EQ_128-NEXT: smull v8.4s, v25.4h, v27.4h +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.4h, v8.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v8.8h, v14.4s, #16 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.4h, v23.4s, #16 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.4h, v24.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v20.4s, #16 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 
v26.4s, v0.8h, v28.8h +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v22.4s, #16 +; VBITS_EQ_128-NEXT: smull v28.4s, v0.4h, v28.4h +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: smull2 v25.4s, v27.8h, v29.8h +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: smull v27.4s, v27.4h, v29.4h +; VBITS_EQ_128-NEXT: shrn v29.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn v0.4h, v13.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v11.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v29.8h, v15.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v12.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v10.4s, #16 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.4h, v27.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v28.4s, #16 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v26.4s, #16 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -285,6 +736,15 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v2i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 @@ -294,13 +754,6 @@ ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret -; VBITS_EQ_128-LABEL: smulh_v2i32: -; VBITS_EQ_128: sshll v0.2d, v0.2s, #0 -; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 -; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0 -; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 -; VBITS_EQ_128-NEXT: ret %1 = sext <2 x i32> %op1 to <2 x i64> %2 = sext <2 x i32> %op2 to <2 x i64> @@ -312,6 +765,13 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v4i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: smull2 v2.2d, v0.4s, v1.4s +; VBITS_EQ_128-NEXT: smull v0.2d, v0.2s, v1.2s +; VBITS_EQ_128-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s @@ -335,6 +795,38 @@ ; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v8i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v5.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: sshll v4.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: sshll v7.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: sshll v6.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z7.d +; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z6.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; +; CHECK-LABEL: smulh_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b %1 = sext <8 x i32> %op1 to <8 x i64> @@ -347,6 +839,49 @@ } define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v16i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v19.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: sshll v18.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: sshll v7.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #32] +; VBITS_EQ_128-NEXT: sshll v0.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v4.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v21.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: sshll v5.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: sshll2 v22.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z19.d +; VBITS_EQ_128-NEXT: sshll v6.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z21.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z22.d +; VBITS_EQ_128-NEXT: sshll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z18.d +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z19.d +; VBITS_EQ_128-NEXT: sshll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z20.d +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: mul z4.d, 
p0/m, z4.d, z17.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q6, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q0, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: smulh_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -367,6 +902,95 @@ } define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v32i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -32 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x0, #64] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v27.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v29.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q28, [x0, #96] +; VBITS_EQ_128-NEXT: sshll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v22.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v31.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v8.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: ldp q26, q25, [x1, #96] +; VBITS_EQ_128-NEXT: sshll v30.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v28.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v9.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: sshll v26.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: ldp q24, q21, [x1, #64] +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z31.d +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z9.d +; VBITS_EQ_128-NEXT: sshll2 v10.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: sshll v25.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v31.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z10.d +; VBITS_EQ_128-NEXT: sshll v24.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z30.d +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z27.d +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z31.d +; VBITS_EQ_128-NEXT: sshll2 v30.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: sshll v21.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: sshll v6.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z30.d +; VBITS_EQ_128-NEXT: mul z19.d, p0/m, z19.d, z21.d +; VBITS_EQ_128-NEXT: ldp q20, q18, [x1, #32] +; VBITS_EQ_128-NEXT: sshll v4.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: shrn v19.2s, v19.2d, #32 +; VBITS_EQ_128-NEXT: sshll2 v5.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v7.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v27.2d, v20.4s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v20.2s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q1, [x0] +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z20.d +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z27.d +; VBITS_EQ_128-NEXT: sshll2 v21.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: sshll v18.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: sshll v2.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z21.d +; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z18.d +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: sshll v0.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: shrn v4.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: shrn v18.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: sshll v20.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v23.2d, 
v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z20.d +; VBITS_EQ_128-NEXT: sshll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v26.2d, #32 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z23.d +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn v17.2s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v8.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v18.4s, v29.2d, #32 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v17.4s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q18, q19, [x0, #64] +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #32] +; VBITS_EQ_128-NEXT: stp q2, q0, [x0] +; VBITS_EQ_128-NEXT: stp q16, q17, [x0, #96] +; VBITS_EQ_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -387,6 +1011,267 @@ } define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v64i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 80 +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w29, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -64 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -72 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -80 +; VBITS_EQ_128-NEXT: addvl sp, sp, #-12 +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #96] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: stp q5, q4, [sp, #-80]! 
// 32-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q0, q2, [x0, #48] +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldr q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldr q3, [x0, #80] +; VBITS_EQ_128-NEXT: str q1, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: sshll v1.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: stp q3, q2, [sp, #32] // 32-byte Folded Spill +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: str z1, [x8, #11, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: str z0, [x8, #10, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #9, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q26, [x0, #128] +; VBITS_EQ_128-NEXT: str z0, [x8, #8, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #7, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldp q25, q24, [x0, #160] +; VBITS_EQ_128-NEXT: str z0, [x8, #6, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #5, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v27.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: ldp q30, q0, [x0, #192] +; VBITS_EQ_128-NEXT: str z1, [x8, #4, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v9.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v12.2d, v30.4s, #0 +; VBITS_EQ_128-NEXT: ldp q31, q1, [x0, #224] +; VBITS_EQ_128-NEXT: sshll v11.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v8.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: sshll v10.2d, v31.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v15.2d, v31.4s, #0 +; VBITS_EQ_128-NEXT: ldp q29, q28, [x1, #224] +; VBITS_EQ_128-NEXT: sshll2 v18.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: sshll v31.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: ldp q14, q0, [x1, #192] +; VBITS_EQ_128-NEXT: sshll v1.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v19.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: mul z11.d, p0/m, z11.d, z20.d +; VBITS_EQ_128-NEXT: ldp q21, q22, [x0] +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z18.d +; VBITS_EQ_128-NEXT: sshll v18.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v14.2s, #0 +; VBITS_EQ_128-NEXT: ldp q4, q13, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q7, q3, [x1, #96] +; VBITS_EQ_128-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x1, #64] +; VBITS_EQ_128-NEXT: movprfx z0, z31 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z0, z15 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: sshll v1.2d, v30.2s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q2, q29, [x1, #32] +; VBITS_EQ_128-NEXT: movprfx z15, z10 +; 
VBITS_EQ_128-NEXT: mul z15.d, p0/m, z15.d, z18.d +; VBITS_EQ_128-NEXT: movprfx z0, z8 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z19.d +; VBITS_EQ_128-NEXT: str z0, [x8] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v14.4s, #0 +; VBITS_EQ_128-NEXT: ldp q19, q18, [x1] +; VBITS_EQ_128-NEXT: movprfx z10, z12 +; VBITS_EQ_128-NEXT: mul z10.d, p0/m, z10.d, z0.d +; VBITS_EQ_128-NEXT: movprfx z8, z1 +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z20.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v13.4s, #0 +; VBITS_EQ_128-NEXT: sshll v12.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: sshll v1.2d, v13.2s, #0 +; VBITS_EQ_128-NEXT: mul z9.d, p0/m, z9.d, z0.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: mul z12.d, p0/m, z12.d, z1.d +; VBITS_EQ_128-NEXT: sshll v1.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: mul z27.d, p0/m, z27.d, z0.d +; VBITS_EQ_128-NEXT: sshll v20.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z13, z20 +; VBITS_EQ_128-NEXT: mul z13.d, p0/m, z13.d, z1.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: sshll v1.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z14, z6 +; VBITS_EQ_128-NEXT: mul z14.d, p0/m, z14.d, z0.d +; VBITS_EQ_128-NEXT: sshll v4.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z30, z4 +; VBITS_EQ_128-NEXT: mul z30.d, p0/m, z30.d, z1.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll v1.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z31, z4 +; VBITS_EQ_128-NEXT: mul z31.d, p0/m, z31.d, z0.d +; VBITS_EQ_128-NEXT: sshll v6.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: ldr q4, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z28, z6 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z1.d +; VBITS_EQ_128-NEXT: sshll v1.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z23, z3 +; VBITS_EQ_128-NEXT: mul z23.d, p0/m, z23.d, z0.d +; VBITS_EQ_128-NEXT: sshll v5.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z20, z5 +; VBITS_EQ_128-NEXT: mul z20.d, p0/m, z20.d, z1.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: sshll v4.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z7, z1 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: sshll v3.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z6, z3 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z4.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v5.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z26, z1 +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll v3.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z24, z5 +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z3.d +; VBITS_EQ_128-NEXT: sshll v16.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; 
VBITS_EQ_128-NEXT: sshll2 v0.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z25, z1 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z0.d +; VBITS_EQ_128-NEXT: sshll v5.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: sshll v17.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z29, z16 +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z5.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z4, z1 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z0.d +; VBITS_EQ_128-NEXT: sshll v5.2d, v22.2s, #0 +; VBITS_EQ_128-NEXT: ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v22.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z22, z0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z17.d +; VBITS_EQ_128-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: sshll v1.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: sshll v17.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: sshll v3.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: sshll2 v18.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: movprfx z2, z5 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z3.d +; VBITS_EQ_128-NEXT: mul z18.d, p0/m, z18.d, z16.d +; VBITS_EQ_128-NEXT: sshll2 v5.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v19.4s, #0 +; VBITS_EQ_128-NEXT: sshll v17.2d, v19.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v1.2d, #32 +; VBITS_EQ_128-NEXT: sshll v3.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: shrn v21.2s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v1.2s, v20.2d, #32 +; VBITS_EQ_128-NEXT: mul z17.d, p0/m, z17.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v21.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v3.2s, v13.2d, #32 +; VBITS_EQ_128-NEXT: ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v4.2s, v12.2d, #32 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v15.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v23.2d, #32 +; VBITS_EQ_128-NEXT: ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v27.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v9.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v19.2d, #32 +; VBITS_EQ_128-NEXT: shrn v19.2s, v11.2d, #32 +; VBITS_EQ_128-NEXT: ldr z22, [x8] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q16, q21, [x0, #32] +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v20.2d, #32 +; VBITS_EQ_128-NEXT: shrn v20.2s, v8.2d, #32 +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: stp q3, q4, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #224] +; VBITS_EQ_128-NEXT: shrn v6.2s, v30.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn v4.2s, v29.2d, #32 +; VBITS_EQ_128-NEXT: shrn v1.2s, v17.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v20.4s, v10.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v14.2d, #32 +; 
VBITS_EQ_128-NEXT: shrn2 v7.4s, v31.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v26.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v18.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #128] +; VBITS_EQ_128-NEXT: stp q4, q3, [x0, #64] +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: stp q20, q19, [x0, #192] +; VBITS_EQ_128-NEXT: addvl sp, sp, #12 +; VBITS_EQ_128-NEXT: add sp, sp, #80 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -408,6 +1293,16 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v1i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: // kill: def $d1 killed $d1 def $q1 +; VBITS_EQ_128-NEXT: // kill: def $d0 killed $d0 def $q0 +; VBITS_EQ_128-NEXT: fmov x8, d0 +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: smulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -428,6 +1323,19 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; VBITS_EQ_128-LABEL: smulh_v2i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: mov x8, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d0 +; VBITS_EQ_128-NEXT: mov x9, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d1 +; VBITS_EQ_128-NEXT: smulh x10, x10, x11 +; VBITS_EQ_128-NEXT: smulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x10 +; VBITS_EQ_128-NEXT: fmov d1, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -445,6 +1353,31 @@ } define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v4i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: mov x10, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: mov x8, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: mov x12, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d2 +; VBITS_EQ_128-NEXT: mov x14, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d3 +; VBITS_EQ_128-NEXT: smulh x11, x11, x13 +; VBITS_EQ_128-NEXT: smulh x10, x10, x12 +; VBITS_EQ_128-NEXT: smulh x9, x9, x15 +; VBITS_EQ_128-NEXT: smulh x8, x8, x14 +; VBITS_EQ_128-NEXT: fmov d0, x11 +; VBITS_EQ_128-NEXT: fmov d1, x10 +; VBITS_EQ_128-NEXT: fmov d2, x9 +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: smulh_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -465,6 +1398,52 @@ } define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v8i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: fmov x14, d0 +; 
VBITS_EQ_128-NEXT: mov x13, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0] +; VBITS_EQ_128-NEXT: mov x11, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x12, d1 +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: fmov x17, d4 +; VBITS_EQ_128-NEXT: mov x15, v4.d[1] +; VBITS_EQ_128-NEXT: ldp q3, q1, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d5 +; VBITS_EQ_128-NEXT: smulh x14, x14, x17 +; VBITS_EQ_128-NEXT: mov x18, v5.d[1] +; VBITS_EQ_128-NEXT: smulh x13, x13, x15 +; VBITS_EQ_128-NEXT: fmov x15, d2 +; VBITS_EQ_128-NEXT: smulh x12, x12, x1 +; VBITS_EQ_128-NEXT: mov x1, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d1 +; VBITS_EQ_128-NEXT: smulh x11, x11, x18 +; VBITS_EQ_128-NEXT: mov x16, v1.d[1] +; VBITS_EQ_128-NEXT: fmov d2, x13 +; VBITS_EQ_128-NEXT: fmov d5, x12 +; VBITS_EQ_128-NEXT: smulh x9, x9, x17 +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: smulh x10, x10, x1 +; VBITS_EQ_128-NEXT: fmov d3, x14 +; VBITS_EQ_128-NEXT: smulh x8, x8, x16 +; VBITS_EQ_128-NEXT: fmov d4, x11 +; VBITS_EQ_128-NEXT: smulh x15, x15, x17 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d6, x10 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q3, q5, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q1, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: smulh_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -485,6 +1464,102 @@ } define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v16i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str x21, [sp, #-32]! 
// 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -32 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0] +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d2 +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: mov x14, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d4 +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d5 +; VBITS_EQ_128-NEXT: fmov x5, d0 +; VBITS_EQ_128-NEXT: mov x4, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0, #64] +; VBITS_EQ_128-NEXT: mov x3, v1.d[1] +; VBITS_EQ_128-NEXT: mov x18, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #96] +; VBITS_EQ_128-NEXT: mov x16, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: fmov x19, d5 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q7, [x1, #64] +; VBITS_EQ_128-NEXT: mov x20, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d6 +; VBITS_EQ_128-NEXT: smulh x5, x5, x19 +; VBITS_EQ_128-NEXT: smulh x4, x4, x6 +; VBITS_EQ_128-NEXT: mov x19, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x6, d4 +; VBITS_EQ_128-NEXT: smulh x3, x3, x20 +; VBITS_EQ_128-NEXT: ldp q3, q16, [x1, #32] +; VBITS_EQ_128-NEXT: fmov x20, d7 +; VBITS_EQ_128-NEXT: smulh x2, x2, x6 +; VBITS_EQ_128-NEXT: smulh x18, x18, x19 +; VBITS_EQ_128-NEXT: fmov d18, x4 +; VBITS_EQ_128-NEXT: fmov d19, x5 +; VBITS_EQ_128-NEXT: fmov d20, x3 +; VBITS_EQ_128-NEXT: smulh x17, x17, x20 +; VBITS_EQ_128-NEXT: fmov x19, d3 +; VBITS_EQ_128-NEXT: fmov d23, x2 +; VBITS_EQ_128-NEXT: ldp q2, q17, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d1 +; VBITS_EQ_128-NEXT: fmov x20, d16 +; VBITS_EQ_128-NEXT: smulh x15, x15, x19 +; VBITS_EQ_128-NEXT: fmov d22, x18 +; VBITS_EQ_128-NEXT: mov v19.d[1], v18.d[0] +; VBITS_EQ_128-NEXT: smulh x1, x1, x21 +; VBITS_EQ_128-NEXT: mov x21, v7.d[1] +; VBITS_EQ_128-NEXT: smulh x13, x13, x20 +; VBITS_EQ_128-NEXT: mov x7, v17.d[1] +; VBITS_EQ_128-NEXT: mov x6, v2.d[1] +; VBITS_EQ_128-NEXT: mov x20, v16.d[1] +; VBITS_EQ_128-NEXT: smulh x16, x16, x21 +; VBITS_EQ_128-NEXT: fmov x21, d2 +; VBITS_EQ_128-NEXT: fmov x19, d17 +; VBITS_EQ_128-NEXT: smulh x8, x8, x7 +; VBITS_EQ_128-NEXT: smulh x10, x10, x6 +; VBITS_EQ_128-NEXT: fmov d5, x13 +; VBITS_EQ_128-NEXT: smulh x11, x11, x21 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov x21, v3.d[1] +; VBITS_EQ_128-NEXT: smulh x9, x9, x19 +; VBITS_EQ_128-NEXT: smulh x12, x12, x20 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d2, x10 +; VBITS_EQ_128-NEXT: fmov d16, x16 +; VBITS_EQ_128-NEXT: fmov d3, x11 +; VBITS_EQ_128-NEXT: fmov d17, x17 +; VBITS_EQ_128-NEXT: smulh x14, x14, x21 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d4, x12 +; VBITS_EQ_128-NEXT: fmov d21, x1 +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v17.d[1], v16.d[0] +; VBITS_EQ_128-NEXT: fmov d6, x14 +; VBITS_EQ_128-NEXT: mov v21.d[1], v20.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: stp q23, q17, [x0, #64] +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q19, q21, [x0, #96] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q7, q5, 
[x0, #32] +; VBITS_EQ_128-NEXT: stp q3, q1, [x0] +; VBITS_EQ_128-NEXT: ldr x21, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -505,6 +1580,228 @@ } define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v32i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #224 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 224 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x28, x27, [sp, #144] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x26, x25, [sp, #160] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x24, x23, [sp, #176] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x22, x21, [sp, #192] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #208] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -24 +; VBITS_EQ_128-NEXT: .cfi_offset w22, -32 +; VBITS_EQ_128-NEXT: .cfi_offset w23, -40 +; VBITS_EQ_128-NEXT: .cfi_offset w24, -48 +; VBITS_EQ_128-NEXT: .cfi_offset w25, -56 +; VBITS_EQ_128-NEXT: .cfi_offset w26, -64 +; VBITS_EQ_128-NEXT: .cfi_offset w27, -72 +; VBITS_EQ_128-NEXT: .cfi_offset w28, -80 +; VBITS_EQ_128-NEXT: .cfi_offset w30, -88 +; VBITS_EQ_128-NEXT: .cfi_offset w29, -96 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -104 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -112 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -120 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -128 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -136 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -144 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -152 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -160 +; VBITS_EQ_128-NEXT: ldp q3, q2, [x0] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: ldp q5, q4, [x0, #64] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x8, d3 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x7, d5 +; VBITS_EQ_128-NEXT: str x8, [sp] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q6, q3, [x0, #96] +; VBITS_EQ_128-NEXT: mov x20, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d4 +; VBITS_EQ_128-NEXT: mov x23, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x24, d6 +; VBITS_EQ_128-NEXT: ldp q16, q4, [x0, #128] +; VBITS_EQ_128-NEXT: mov x26, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: mov x28, v16.d[1] +; VBITS_EQ_128-NEXT: fmov x25, d16 +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #224] +; VBITS_EQ_128-NEXT: mov x22, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x19, d4 +; VBITS_EQ_128-NEXT: mov x13, v7.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d7 +; VBITS_EQ_128-NEXT: ldp q17, q6, [x0, #192] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d5 +; VBITS_EQ_128-NEXT: mov x17, v17.d[1] +; VBITS_EQ_128-NEXT: fmov x16, d17 +; VBITS_EQ_128-NEXT: ldp q18, q3, [x0, #160] +; VBITS_EQ_128-NEXT: mov x15, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x14, d6 +; VBITS_EQ_128-NEXT: mov x5, v18.d[1] +; VBITS_EQ_128-NEXT: fmov x4, d18 +; VBITS_EQ_128-NEXT: ldp q19, q16, [x1, #224] +; VBITS_EQ_128-NEXT: mov x29, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x18, d3 +; VBITS_EQ_128-NEXT: fmov 
x8, d19 +; VBITS_EQ_128-NEXT: mov x9, v19.d[1] +; VBITS_EQ_128-NEXT: ldp q21, q20, [x1, #192] +; VBITS_EQ_128-NEXT: mov x30, v16.d[1] +; VBITS_EQ_128-NEXT: smulh x8, x11, x8 +; VBITS_EQ_128-NEXT: smulh x11, x13, x9 +; VBITS_EQ_128-NEXT: fmov x9, d21 +; VBITS_EQ_128-NEXT: str x8, [sp, #48] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q22, q18, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q24, q23, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q25, q17, [x1, #96] +; VBITS_EQ_128-NEXT: ldp q26, q6, [x1, #64] +; VBITS_EQ_128-NEXT: ldp q4, q3, [x1, #32] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d16 +; VBITS_EQ_128-NEXT: smulh x10, x10, x1 +; VBITS_EQ_128-NEXT: mov x1, v20.d[1] +; VBITS_EQ_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_EQ_128-NEXT: str x10, [sp, #56] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: smulh x10, x12, x30 +; VBITS_EQ_128-NEXT: mov x30, v21.d[1] +; VBITS_EQ_128-NEXT: fmov x3, d1 +; VBITS_EQ_128-NEXT: str x10, [sp, #24] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x10, d20 +; VBITS_EQ_128-NEXT: ldr x13, [sp, #16] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d11, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smulh x8, x14, x10 +; VBITS_EQ_128-NEXT: smulh x10, x15, x1 +; VBITS_EQ_128-NEXT: fmov x15, d18 +; VBITS_EQ_128-NEXT: smulh x14, x16, x9 +; VBITS_EQ_128-NEXT: mov x9, v22.d[1] +; VBITS_EQ_128-NEXT: smulh x16, x17, x30 +; VBITS_EQ_128-NEXT: stp x11, x8, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x17, d22 +; VBITS_EQ_128-NEXT: mov x8, v18.d[1] +; VBITS_EQ_128-NEXT: smulh x18, x18, x15 +; VBITS_EQ_128-NEXT: mov x15, v23.d[1] +; VBITS_EQ_128-NEXT: str x10, [sp, #8] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: smulh x4, x4, x17 +; VBITS_EQ_128-NEXT: fmov d8, x16 +; VBITS_EQ_128-NEXT: mov x17, v24.d[1] +; VBITS_EQ_128-NEXT: smulh x5, x5, x9 +; VBITS_EQ_128-NEXT: smulh x1, x29, x8 +; VBITS_EQ_128-NEXT: fmov x8, d23 +; VBITS_EQ_128-NEXT: fmov x9, d24 +; VBITS_EQ_128-NEXT: smulh x22, x22, x15 +; VBITS_EQ_128-NEXT: fmov x15, d17 +; VBITS_EQ_128-NEXT: fmov d9, x14 +; VBITS_EQ_128-NEXT: smulh x19, x19, x8 +; VBITS_EQ_128-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x8, v17.d[1] +; VBITS_EQ_128-NEXT: smulh x25, x25, x9 +; VBITS_EQ_128-NEXT: mov x9, v25.d[1] +; VBITS_EQ_128-NEXT: smulh x28, x28, x17 +; VBITS_EQ_128-NEXT: fmov x17, d25 +; VBITS_EQ_128-NEXT: smulh x15, x27, x15 +; VBITS_EQ_128-NEXT: mov x27, v6.d[1] +; VBITS_EQ_128-NEXT: ldr d15, [sp, #40] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: smulh x12, x26, x8 +; VBITS_EQ_128-NEXT: fmov x26, d6 +; VBITS_EQ_128-NEXT: smulh x17, x24, x17 +; VBITS_EQ_128-NEXT: ldr x8, [sp] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x24, v26.d[1] +; VBITS_EQ_128-NEXT: smulh x11, x23, x9 +; VBITS_EQ_128-NEXT: fmov x23, d26 +; VBITS_EQ_128-NEXT: smulh x21, x21, x26 +; VBITS_EQ_128-NEXT: fmov x26, d0 +; VBITS_EQ_128-NEXT: smulh x20, x20, x27 +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: fmov d20, x17 +; VBITS_EQ_128-NEXT: smulh x7, x7, x23 +; VBITS_EQ_128-NEXT: fmov x23, d4 +; VBITS_EQ_128-NEXT: smulh x6, x6, x24 +; VBITS_EQ_128-NEXT: fmov x24, d5 +; VBITS_EQ_128-NEXT: smulh x26, x26, x27 +; VBITS_EQ_128-NEXT: fmov x27, d7 +; VBITS_EQ_128-NEXT: smulh x3, x3, x23 +; VBITS_EQ_128-NEXT: fmov d19, x20 +; VBITS_EQ_128-NEXT: mov x23, v2.d[1] +; VBITS_EQ_128-NEXT: smulh x2, x2, x24 +; VBITS_EQ_128-NEXT: mov x24, v1.d[1] +; VBITS_EQ_128-NEXT: smulh x27, x8, x27 +; VBITS_EQ_128-NEXT: mov x29, v0.d[1] +; VBITS_EQ_128-NEXT: mov x30, v7.d[1] +; 
VBITS_EQ_128-NEXT: mov x8, v5.d[1] +; VBITS_EQ_128-NEXT: mov x9, v4.d[1] +; VBITS_EQ_128-NEXT: mov x10, v3.d[1] +; VBITS_EQ_128-NEXT: ldp d10, d12, [sp, #24] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smulh x30, x13, x30 +; VBITS_EQ_128-NEXT: fmov d0, x27 +; VBITS_EQ_128-NEXT: smulh x8, x23, x8 +; VBITS_EQ_128-NEXT: fmov d2, x2 +; VBITS_EQ_128-NEXT: smulh x9, x24, x9 +; VBITS_EQ_128-NEXT: fmov d4, x3 +; VBITS_EQ_128-NEXT: smulh x10, x29, x10 +; VBITS_EQ_128-NEXT: fmov d6, x26 +; VBITS_EQ_128-NEXT: mov v11.d[1], v10.d[0] +; VBITS_EQ_128-NEXT: fmov d1, x30 +; VBITS_EQ_128-NEXT: mov v13.d[1], v12.d[0] +; VBITS_EQ_128-NEXT: mov v15.d[1], v14.d[0] +; VBITS_EQ_128-NEXT: mov v9.d[1], v8.d[0] +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: fmov d5, x9 +; VBITS_EQ_128-NEXT: fmov d7, x10 +; VBITS_EQ_128-NEXT: fmov d17, x6 +; VBITS_EQ_128-NEXT: fmov d16, x7 +; VBITS_EQ_128-NEXT: fmov d18, x21 +; VBITS_EQ_128-NEXT: fmov d21, x11 +; VBITS_EQ_128-NEXT: fmov d22, x12 +; VBITS_EQ_128-NEXT: fmov d23, x15 +; VBITS_EQ_128-NEXT: fmov d24, x28 +; VBITS_EQ_128-NEXT: fmov d25, x25 +; VBITS_EQ_128-NEXT: fmov d26, x22 +; VBITS_EQ_128-NEXT: fmov d27, x19 +; VBITS_EQ_128-NEXT: fmov d28, x5 +; VBITS_EQ_128-NEXT: fmov d29, x4 +; VBITS_EQ_128-NEXT: fmov d30, x1 +; VBITS_EQ_128-NEXT: fmov d31, x18 +; VBITS_EQ_128-NEXT: mov v27.d[1], v26.d[0] +; VBITS_EQ_128-NEXT: stp q9, q15, [x0, #192] +; VBITS_EQ_128-NEXT: stp q13, q11, [x0, #224] +; VBITS_EQ_128-NEXT: mov v31.d[1], v30.d[0] +; VBITS_EQ_128-NEXT: mov v29.d[1], v28.d[0] +; VBITS_EQ_128-NEXT: mov v25.d[1], v24.d[0] +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v20.d[1], v21.d[0] +; VBITS_EQ_128-NEXT: mov v18.d[1], v19.d[0] +; VBITS_EQ_128-NEXT: stp q29, q31, [x0, #160] +; VBITS_EQ_128-NEXT: mov v16.d[1], v17.d[0] +; VBITS_EQ_128-NEXT: stp q25, q27, [x0, #128] +; VBITS_EQ_128-NEXT: mov v6.d[1], v7.d[0] +; VBITS_EQ_128-NEXT: mov v4.d[1], v5.d[0] +; VBITS_EQ_128-NEXT: stp q20, q23, [x0, #96] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: stp q16, q18, [x0, #64] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #208] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q4, q6, [x0, #32] +; VBITS_EQ_128-NEXT: ldp x22, x21, [sp, #192] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ldp x24, x23, [sp, #176] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x26, x25, [sp, #160] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x28, x27, [sp, #144] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #224 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -531,6 +1828,12 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. 
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v8i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v1.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b @@ -563,6 +1866,13 @@ ; Don't use SVE for 128-bit vectors. define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v16i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: umull2 v2.8h, v0.16b, v1.16b +; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v1.8b +; VBITS_EQ_128-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b @@ -586,6 +1896,29 @@ ; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v32i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: umull v4.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: umull2 v0.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v5.8h, v1.8b, v3.8b +; VBITS_EQ_128-NEXT: umull2 v1.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v2.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn v3.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v1.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; +; CHECK-LABEL: umulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b %1 = zext <32 x i8> %op1 to <32 x i16> @@ -598,6 +1931,32 @@ } define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v64i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: umull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: umull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v16.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v3.8h, v4.16b, v5.16b +; VBITS_EQ_128-NEXT: umull v4.8h, v4.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: umulh_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 @@ -618,6 +1977,54 @@ } define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v128i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: umull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, 
#64] +; VBITS_EQ_128-NEXT: umull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: umull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v17.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v19.8h, v4.16b, v16.16b +; VBITS_EQ_128-NEXT: umull v4.8h, v4.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v19.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v21.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: umull v3.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: umull2 v23.8h, v18.16b, v20.16b +; VBITS_EQ_128-NEXT: umull v18.8h, v18.8b, v20.8b +; VBITS_EQ_128-NEXT: shrn v3.8b, v3.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.16b, v23.8h, #8 +; VBITS_EQ_128-NEXT: umull v20.8h, v16.8b, v5.8b +; VBITS_EQ_128-NEXT: umull2 v5.8h, v16.16b, v5.16b +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull v25.8h, v22.8b, v24.8b +; VBITS_EQ_128-NEXT: umull2 v16.8h, v22.16b, v24.16b +; VBITS_EQ_128-NEXT: shrn v20.8b, v20.8h, #8 +; VBITS_EQ_128-NEXT: shrn v22.8b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v20.16b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v22.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -641,6 +2048,121 @@ } define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v256i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: umull2 v0.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: umull v4.8h, v1.8b, v3.8b +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: umull2 v0.8h, v2.16b, v6.16b +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: umull v6.8h, v2.8b, v6.8b +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: umull2 v2.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v6.8b, v6.8h, #8 +; VBITS_EQ_128-NEXT: umull v5.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q19, q18, 
[x0, #160] +; VBITS_EQ_128-NEXT: umull2 v3.8h, v7.16b, v16.16b +; VBITS_EQ_128-NEXT: umull v7.8h, v7.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v5.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v5.16b, v2.8h, #8 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.8b, v7.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v7.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v31.8h, v19.16b, v16.16b +; VBITS_EQ_128-NEXT: umull v9.8h, v19.8b, v16.8b +; VBITS_EQ_128-NEXT: umull2 v21.8h, v18.16b, v17.16b +; VBITS_EQ_128-NEXT: umull v30.8h, v18.8b, v17.8b +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.8b, v9.8h, #8 +; VBITS_EQ_128-NEXT: shrn v30.8b, v30.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v9.16b, v31.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v30.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: umull2 v16.8h, v17.16b, v20.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v18.8h, v17.8b, v20.8b +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: umull2 v17.8h, v22.16b, v19.16b +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: umull v19.8h, v22.8b, v19.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.8b, v19.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v12.8h, v24.16b, v22.16b +; VBITS_EQ_128-NEXT: umull v13.8h, v24.8b, v22.8b +; VBITS_EQ_128-NEXT: umull2 v10.8h, v20.16b, v23.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v11.8h, v20.8b, v23.8b +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: umull2 v22.8h, v26.16b, v24.16b +; VBITS_EQ_128-NEXT: umull v24.8h, v26.8b, v24.8b +; VBITS_EQ_128-NEXT: umull2 v20.8h, v23.16b, v25.16b +; VBITS_EQ_128-NEXT: umull v23.8h, v23.8b, v25.8b +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v15.8h, v28.16b, v26.16b +; VBITS_EQ_128-NEXT: umull v1.8h, v28.8b, v26.8b +; VBITS_EQ_128-NEXT: umull2 v14.8h, v25.16b, v27.16b +; VBITS_EQ_128-NEXT: umull v8.8h, v25.8b, v27.8b +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.8b, v8.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v8.16b, v14.8h, #8 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.8b, v23.8h, #8 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.8b, v24.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v20.8h, #8 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: umull2 v26.8h, v0.16b, v28.16b +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v22.8h, #8 +; VBITS_EQ_128-NEXT: umull v28.8h, v0.8b, v28.8b +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: umull2 v25.8h, v27.16b, v29.16b +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: umull v27.8h, v27.8b, v29.8b +; VBITS_EQ_128-NEXT: shrn v29.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn v0.8b, v13.8h, #8 +; VBITS_EQ_128-NEXT: shrn v1.8b, v11.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v29.16b, v15.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v12.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v10.8h, #8 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.8b, v27.8h, #8 +; 
VBITS_EQ_128-NEXT: shrn v1.8b, v28.8h, #8 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v26.8h, #8 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -663,6 +2185,12 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v4i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v1.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h @@ -686,6 +2214,13 @@ ; Don't use SVE for 128-bit vectors. define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v8i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: umull2 v2.4s, v0.8h, v1.8h +; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v1.4h +; VBITS_EQ_128-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h @@ -709,6 +2244,29 @@ ; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v16i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: umull v4.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: umull2 v0.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v5.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: umull2 v1.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v2.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn v3.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v1.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; +; CHECK-LABEL: umulh_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b %1 = zext <16 x i16> %op1 to <16 x i32> @@ -721,6 +2279,32 @@ } define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v32i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: umull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: umull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v16.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v3.4s, v4.8h, v5.8h +; VBITS_EQ_128-NEXT: 
umull v4.4s, v4.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: umulh_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -741,6 +2325,54 @@ } define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v64i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: umull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64] +; VBITS_EQ_128-NEXT: umull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: umull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v17.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v19.4s, v4.8h, v16.8h +; VBITS_EQ_128-NEXT: umull v4.4s, v4.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v19.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v21.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: umull v3.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: umull2 v23.4s, v18.8h, v20.8h +; VBITS_EQ_128-NEXT: umull v18.4s, v18.4h, v20.4h +; VBITS_EQ_128-NEXT: shrn v3.4h, v3.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.8h, v23.4s, #16 +; VBITS_EQ_128-NEXT: umull v20.4s, v16.4h, v5.4h +; VBITS_EQ_128-NEXT: umull2 v5.4s, v16.8h, v5.8h +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull v25.4s, v22.4h, v24.4h +; VBITS_EQ_128-NEXT: umull2 v16.4s, v22.8h, v24.8h +; VBITS_EQ_128-NEXT: shrn v20.4h, v20.4s, #16 +; VBITS_EQ_128-NEXT: shrn v22.4h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v20.8h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v22.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -761,6 +2393,121 @@ } define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v128i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; 
VBITS_EQ_128-NEXT: .cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: umull2 v0.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: umull v4.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: umull2 v0.4s, v2.8h, v6.8h +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: umull v6.4s, v2.4h, v6.4h +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: umull2 v2.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v6.4h, v6.4s, #16 +; VBITS_EQ_128-NEXT: umull v5.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160] +; VBITS_EQ_128-NEXT: umull2 v3.4s, v7.8h, v16.8h +; VBITS_EQ_128-NEXT: umull v7.4s, v7.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v5.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v5.8h, v2.4s, #16 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.4h, v7.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v7.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v31.4s, v19.8h, v16.8h +; VBITS_EQ_128-NEXT: umull v9.4s, v19.4h, v16.4h +; VBITS_EQ_128-NEXT: umull2 v21.4s, v18.8h, v17.8h +; VBITS_EQ_128-NEXT: umull v30.4s, v18.4h, v17.4h +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.4h, v9.4s, #16 +; VBITS_EQ_128-NEXT: shrn v30.4h, v30.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v9.8h, v31.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v30.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: umull2 v16.4s, v17.8h, v20.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v18.4s, v17.4h, v20.4h +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: umull2 v17.4s, v22.8h, v19.8h +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: umull v19.4s, v22.4h, v19.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.4h, v19.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v12.4s, v24.8h, v22.8h +; VBITS_EQ_128-NEXT: umull v13.4s, v24.4h, v22.4h +; VBITS_EQ_128-NEXT: umull2 v10.4s, v20.8h, v23.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v11.4s, v20.4h, v23.4h +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: umull2 v22.4s, v26.8h, v24.8h +; VBITS_EQ_128-NEXT: umull v24.4s, v26.4h, v24.4h +; VBITS_EQ_128-NEXT: umull2 v20.4s, v23.8h, v25.8h +; VBITS_EQ_128-NEXT: umull v23.4s, v23.4h, v25.4h +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v15.4s, v28.8h, v26.8h +; VBITS_EQ_128-NEXT: umull v1.4s, v28.4h, v26.4h +; VBITS_EQ_128-NEXT: umull2 v14.4s, v25.8h, v27.8h +; VBITS_EQ_128-NEXT: umull v8.4s, v25.4h, v27.4h +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.4h, v8.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v8.8h, v14.4s, #16 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.4h, v23.4s, #16 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.4h, v24.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v20.4s, #16 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; 
VBITS_EQ_128-NEXT: umull2 v26.4s, v0.8h, v28.8h +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v22.4s, #16 +; VBITS_EQ_128-NEXT: umull v28.4s, v0.4h, v28.4h +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: umull2 v25.4s, v27.8h, v29.8h +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: umull v27.4s, v27.4h, v29.4h +; VBITS_EQ_128-NEXT: shrn v29.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn v0.4h, v13.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v11.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v29.8h, v15.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v12.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v10.4s, #16 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.4h, v27.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v28.4s, #16 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v26.4s, #16 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -782,6 +2529,15 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v2i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 @@ -791,13 +2547,6 @@ ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret -; VBITS_EQ_128-LABEL: umulh_v2i32: -; VBITS_EQ_128: ushll v0.2d, v0.2s, #0 -; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 -; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0 -; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 -; VBITS_EQ_128-NEXT: ret %1 = zext <2 x i32> %op1 to <2 x i64> %2 = zext <2 x i32> %op2 to <2 x i64> @@ -809,6 +2558,13 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v4i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: umull2 v2.2d, v0.4s, v1.4s +; VBITS_EQ_128-NEXT: umull v0.2d, v0.2s, v1.2s +; VBITS_EQ_128-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s @@ -832,6 +2588,38 @@ ; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v8i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v5.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: ushll v4.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ushll v7.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: ushll v6.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z7.d +; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z6.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; +; CHECK-LABEL: umulh_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b %insert = insertelement <8 x i64> undef, i64 32, i64 0 @@ -846,6 +2634,49 @@ } define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v16i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v19.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: ushll v18.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: ushll v7.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #32] +; VBITS_EQ_128-NEXT: ushll v0.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v4.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v21.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ushll v5.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: ushll2 v22.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z19.d +; VBITS_EQ_128-NEXT: ushll v6.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z21.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z22.d +; VBITS_EQ_128-NEXT: ushll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z18.d +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z19.d +; VBITS_EQ_128-NEXT: ushll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z20.d +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; 
VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z17.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q6, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q0, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: umulh_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -866,6 +2697,95 @@ } define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v32i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -32 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x0, #64] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v27.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v29.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q28, [x0, #96] +; VBITS_EQ_128-NEXT: ushll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v22.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: ushll v31.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v8.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: ldp q26, q25, [x1, #96] +; VBITS_EQ_128-NEXT: ushll v30.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v28.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v9.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: ushll v26.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: ldp q24, q21, [x1, #64] +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z31.d +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z9.d +; VBITS_EQ_128-NEXT: ushll2 v10.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: ushll v25.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v31.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z10.d +; VBITS_EQ_128-NEXT: ushll v24.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z30.d +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z27.d +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z31.d +; VBITS_EQ_128-NEXT: ushll2 v30.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: ushll v21.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: ushll v6.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z30.d +; VBITS_EQ_128-NEXT: mul z19.d, p0/m, z19.d, z21.d +; VBITS_EQ_128-NEXT: ldp q20, q18, [x1, #32] +; VBITS_EQ_128-NEXT: ushll v4.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: shrn v19.2s, v19.2d, #32 +; VBITS_EQ_128-NEXT: ushll2 v5.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v7.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v27.2d, v20.4s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v20.2s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q1, [x0] +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z20.d +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z27.d +; VBITS_EQ_128-NEXT: ushll2 v21.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: ushll v18.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: ushll v2.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z21.d +; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z18.d +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: ushll v0.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: shrn v4.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: shrn v18.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: ushll v20.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v16.4s, #0 +; 
VBITS_EQ_128-NEXT: ushll v23.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z20.d +; VBITS_EQ_128-NEXT: ushll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v26.2d, #32 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z23.d +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn v17.2s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v8.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v18.4s, v29.2d, #32 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v17.4s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q18, q19, [x0, #64] +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #32] +; VBITS_EQ_128-NEXT: stp q2, q0, [x0] +; VBITS_EQ_128-NEXT: stp q16, q17, [x0, #96] +; VBITS_EQ_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -886,6 +2806,267 @@ } define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v64i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 80 +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w29, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -64 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -72 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -80 +; VBITS_EQ_128-NEXT: addvl sp, sp, #-12 +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #96] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: stp q5, q4, [sp, #-80]! 
// 32-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q0, q2, [x0, #48] +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldr q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldr q3, [x0, #80] +; VBITS_EQ_128-NEXT: str q1, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ushll v1.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: stp q3, q2, [sp, #32] // 32-byte Folded Spill +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: str z1, [x8, #11, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: str z0, [x8, #10, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #9, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q26, [x0, #128] +; VBITS_EQ_128-NEXT: str z0, [x8, #8, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #7, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldp q25, q24, [x0, #160] +; VBITS_EQ_128-NEXT: str z0, [x8, #6, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #5, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v27.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: ldp q30, q0, [x0, #192] +; VBITS_EQ_128-NEXT: str z1, [x8, #4, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v9.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v12.2d, v30.4s, #0 +; VBITS_EQ_128-NEXT: ldp q31, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ushll v11.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v8.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ushll v10.2d, v31.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v15.2d, v31.4s, #0 +; VBITS_EQ_128-NEXT: ldp q29, q28, [x1, #224] +; VBITS_EQ_128-NEXT: ushll2 v18.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ushll v31.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: ldp q14, q0, [x1, #192] +; VBITS_EQ_128-NEXT: ushll v1.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v19.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: mul z11.d, p0/m, z11.d, z20.d +; VBITS_EQ_128-NEXT: ldp q21, q22, [x0] +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z18.d +; VBITS_EQ_128-NEXT: ushll v18.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v14.2s, #0 +; VBITS_EQ_128-NEXT: ldp q4, q13, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q7, q3, [x1, #96] +; VBITS_EQ_128-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x1, #64] +; VBITS_EQ_128-NEXT: movprfx z0, z31 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z0, z15 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: ushll v1.2d, v30.2s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q2, q29, [x1, #32] +; VBITS_EQ_128-NEXT: movprfx z15, z10 +; 
VBITS_EQ_128-NEXT: mul z15.d, p0/m, z15.d, z18.d +; VBITS_EQ_128-NEXT: movprfx z0, z8 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z19.d +; VBITS_EQ_128-NEXT: str z0, [x8] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v14.4s, #0 +; VBITS_EQ_128-NEXT: ldp q19, q18, [x1] +; VBITS_EQ_128-NEXT: movprfx z10, z12 +; VBITS_EQ_128-NEXT: mul z10.d, p0/m, z10.d, z0.d +; VBITS_EQ_128-NEXT: movprfx z8, z1 +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z20.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v13.4s, #0 +; VBITS_EQ_128-NEXT: ushll v12.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: ushll v1.2d, v13.2s, #0 +; VBITS_EQ_128-NEXT: mul z9.d, p0/m, z9.d, z0.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: mul z12.d, p0/m, z12.d, z1.d +; VBITS_EQ_128-NEXT: ushll v1.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: mul z27.d, p0/m, z27.d, z0.d +; VBITS_EQ_128-NEXT: ushll v20.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z13, z20 +; VBITS_EQ_128-NEXT: mul z13.d, p0/m, z13.d, z1.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: ushll v1.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z14, z6 +; VBITS_EQ_128-NEXT: mul z14.d, p0/m, z14.d, z0.d +; VBITS_EQ_128-NEXT: ushll v4.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z30, z4 +; VBITS_EQ_128-NEXT: mul z30.d, p0/m, z30.d, z1.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll v1.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z31, z4 +; VBITS_EQ_128-NEXT: mul z31.d, p0/m, z31.d, z0.d +; VBITS_EQ_128-NEXT: ushll v6.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: ldr q4, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z28, z6 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z1.d +; VBITS_EQ_128-NEXT: ushll v1.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z23, z3 +; VBITS_EQ_128-NEXT: mul z23.d, p0/m, z23.d, z0.d +; VBITS_EQ_128-NEXT: ushll v5.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z20, z5 +; VBITS_EQ_128-NEXT: mul z20.d, p0/m, z20.d, z1.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: ushll v4.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z7, z1 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ushll v3.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z6, z3 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z4.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: ushll v5.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z26, z1 +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll v3.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z24, z5 +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z3.d +; VBITS_EQ_128-NEXT: ushll v16.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; 
VBITS_EQ_128-NEXT: ushll2 v0.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z25, z1 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z0.d +; VBITS_EQ_128-NEXT: ushll v5.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: ushll v17.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z29, z16 +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z5.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z4, z1 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z0.d +; VBITS_EQ_128-NEXT: ushll v5.2d, v22.2s, #0 +; VBITS_EQ_128-NEXT: ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v22.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z22, z0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z17.d +; VBITS_EQ_128-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ushll v1.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: ushll v17.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ushll v3.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: ushll2 v18.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: movprfx z2, z5 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z3.d +; VBITS_EQ_128-NEXT: mul z18.d, p0/m, z18.d, z16.d +; VBITS_EQ_128-NEXT: ushll2 v5.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v19.4s, #0 +; VBITS_EQ_128-NEXT: ushll v17.2d, v19.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v1.2d, #32 +; VBITS_EQ_128-NEXT: ushll v3.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: shrn v21.2s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v1.2s, v20.2d, #32 +; VBITS_EQ_128-NEXT: mul z17.d, p0/m, z17.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v21.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v3.2s, v13.2d, #32 +; VBITS_EQ_128-NEXT: ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v4.2s, v12.2d, #32 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v15.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v23.2d, #32 +; VBITS_EQ_128-NEXT: ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v27.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v9.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v19.2d, #32 +; VBITS_EQ_128-NEXT: shrn v19.2s, v11.2d, #32 +; VBITS_EQ_128-NEXT: ldr z22, [x8] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q16, q21, [x0, #32] +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v20.2d, #32 +; VBITS_EQ_128-NEXT: shrn v20.2s, v8.2d, #32 +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: stp q3, q4, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #224] +; VBITS_EQ_128-NEXT: shrn v6.2s, v30.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn v4.2s, v29.2d, #32 +; VBITS_EQ_128-NEXT: shrn v1.2s, v17.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v20.4s, v10.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v14.2d, #32 +; 
VBITS_EQ_128-NEXT: shrn2 v7.4s, v31.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v26.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v18.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #128] +; VBITS_EQ_128-NEXT: stp q4, q3, [x0, #64] +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: stp q20, q19, [x0, #192] +; VBITS_EQ_128-NEXT: addvl sp, sp, #12 +; VBITS_EQ_128-NEXT: add sp, sp, #80 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -907,6 +3088,16 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v1i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: // kill: def $d1 killed $d1 def $q1 +; VBITS_EQ_128-NEXT: // kill: def $d0 killed $d0 def $q0 +; VBITS_EQ_128-NEXT: fmov x8, d0 +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: umulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -925,6 +3116,19 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; VBITS_EQ_128-LABEL: umulh_v2i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: mov x8, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d0 +; VBITS_EQ_128-NEXT: mov x9, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d1 +; VBITS_EQ_128-NEXT: umulh x10, x10, x11 +; VBITS_EQ_128-NEXT: umulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x10 +; VBITS_EQ_128-NEXT: fmov d1, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -942,6 +3146,31 @@ } define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v4i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: mov x10, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: mov x8, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: mov x12, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d2 +; VBITS_EQ_128-NEXT: mov x14, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d3 +; VBITS_EQ_128-NEXT: umulh x11, x11, x13 +; VBITS_EQ_128-NEXT: umulh x10, x10, x12 +; VBITS_EQ_128-NEXT: umulh x9, x9, x15 +; VBITS_EQ_128-NEXT: umulh x8, x8, x14 +; VBITS_EQ_128-NEXT: fmov d0, x11 +; VBITS_EQ_128-NEXT: fmov d1, x10 +; VBITS_EQ_128-NEXT: fmov d2, x9 +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; ; CHECK-LABEL: umulh_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -962,6 +3191,52 @@ } define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v8i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: fmov x14, d0 +; 
VBITS_EQ_128-NEXT: mov x13, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0] +; VBITS_EQ_128-NEXT: mov x11, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x12, d1 +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: fmov x17, d4 +; VBITS_EQ_128-NEXT: mov x15, v4.d[1] +; VBITS_EQ_128-NEXT: ldp q3, q1, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d5 +; VBITS_EQ_128-NEXT: umulh x14, x14, x17 +; VBITS_EQ_128-NEXT: mov x18, v5.d[1] +; VBITS_EQ_128-NEXT: umulh x13, x13, x15 +; VBITS_EQ_128-NEXT: fmov x15, d2 +; VBITS_EQ_128-NEXT: umulh x12, x12, x1 +; VBITS_EQ_128-NEXT: mov x1, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d1 +; VBITS_EQ_128-NEXT: umulh x11, x11, x18 +; VBITS_EQ_128-NEXT: mov x16, v1.d[1] +; VBITS_EQ_128-NEXT: fmov d2, x13 +; VBITS_EQ_128-NEXT: fmov d5, x12 +; VBITS_EQ_128-NEXT: umulh x9, x9, x17 +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: umulh x10, x10, x1 +; VBITS_EQ_128-NEXT: fmov d3, x14 +; VBITS_EQ_128-NEXT: umulh x8, x8, x16 +; VBITS_EQ_128-NEXT: fmov d4, x11 +; VBITS_EQ_128-NEXT: umulh x15, x15, x17 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d6, x10 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q3, q5, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q1, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_512-LABEL: umulh_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -982,6 +3257,102 @@ } define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v16i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str x21, [sp, #-32]! 
// 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -32 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0] +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d2 +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: mov x14, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d4 +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d5 +; VBITS_EQ_128-NEXT: fmov x5, d0 +; VBITS_EQ_128-NEXT: mov x4, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0, #64] +; VBITS_EQ_128-NEXT: mov x3, v1.d[1] +; VBITS_EQ_128-NEXT: mov x18, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #96] +; VBITS_EQ_128-NEXT: mov x16, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: fmov x19, d5 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q7, [x1, #64] +; VBITS_EQ_128-NEXT: mov x20, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d6 +; VBITS_EQ_128-NEXT: umulh x5, x5, x19 +; VBITS_EQ_128-NEXT: umulh x4, x4, x6 +; VBITS_EQ_128-NEXT: mov x19, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x6, d4 +; VBITS_EQ_128-NEXT: umulh x3, x3, x20 +; VBITS_EQ_128-NEXT: ldp q3, q16, [x1, #32] +; VBITS_EQ_128-NEXT: fmov x20, d7 +; VBITS_EQ_128-NEXT: umulh x2, x2, x6 +; VBITS_EQ_128-NEXT: umulh x18, x18, x19 +; VBITS_EQ_128-NEXT: fmov d18, x4 +; VBITS_EQ_128-NEXT: fmov d19, x5 +; VBITS_EQ_128-NEXT: fmov d20, x3 +; VBITS_EQ_128-NEXT: umulh x17, x17, x20 +; VBITS_EQ_128-NEXT: fmov x19, d3 +; VBITS_EQ_128-NEXT: fmov d23, x2 +; VBITS_EQ_128-NEXT: ldp q2, q17, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d1 +; VBITS_EQ_128-NEXT: fmov x20, d16 +; VBITS_EQ_128-NEXT: umulh x15, x15, x19 +; VBITS_EQ_128-NEXT: fmov d22, x18 +; VBITS_EQ_128-NEXT: mov v19.d[1], v18.d[0] +; VBITS_EQ_128-NEXT: umulh x1, x1, x21 +; VBITS_EQ_128-NEXT: mov x21, v7.d[1] +; VBITS_EQ_128-NEXT: umulh x13, x13, x20 +; VBITS_EQ_128-NEXT: mov x7, v17.d[1] +; VBITS_EQ_128-NEXT: mov x6, v2.d[1] +; VBITS_EQ_128-NEXT: mov x20, v16.d[1] +; VBITS_EQ_128-NEXT: umulh x16, x16, x21 +; VBITS_EQ_128-NEXT: fmov x21, d2 +; VBITS_EQ_128-NEXT: fmov x19, d17 +; VBITS_EQ_128-NEXT: umulh x8, x8, x7 +; VBITS_EQ_128-NEXT: umulh x10, x10, x6 +; VBITS_EQ_128-NEXT: fmov d5, x13 +; VBITS_EQ_128-NEXT: umulh x11, x11, x21 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov x21, v3.d[1] +; VBITS_EQ_128-NEXT: umulh x9, x9, x19 +; VBITS_EQ_128-NEXT: umulh x12, x12, x20 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d2, x10 +; VBITS_EQ_128-NEXT: fmov d16, x16 +; VBITS_EQ_128-NEXT: fmov d3, x11 +; VBITS_EQ_128-NEXT: fmov d17, x17 +; VBITS_EQ_128-NEXT: umulh x14, x14, x21 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d4, x12 +; VBITS_EQ_128-NEXT: fmov d21, x1 +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v17.d[1], v16.d[0] +; VBITS_EQ_128-NEXT: fmov d6, x14 +; VBITS_EQ_128-NEXT: mov v21.d[1], v20.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: stp q23, q17, [x0, #64] +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q19, q21, [x0, #96] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q7, q5, 
[x0, #32] +; VBITS_EQ_128-NEXT: stp q3, q1, [x0] +; VBITS_EQ_128-NEXT: ldr x21, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1002,6 +3373,228 @@ } define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v32i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #224 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 224 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x28, x27, [sp, #144] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x26, x25, [sp, #160] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x24, x23, [sp, #176] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x22, x21, [sp, #192] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #208] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -24 +; VBITS_EQ_128-NEXT: .cfi_offset w22, -32 +; VBITS_EQ_128-NEXT: .cfi_offset w23, -40 +; VBITS_EQ_128-NEXT: .cfi_offset w24, -48 +; VBITS_EQ_128-NEXT: .cfi_offset w25, -56 +; VBITS_EQ_128-NEXT: .cfi_offset w26, -64 +; VBITS_EQ_128-NEXT: .cfi_offset w27, -72 +; VBITS_EQ_128-NEXT: .cfi_offset w28, -80 +; VBITS_EQ_128-NEXT: .cfi_offset w30, -88 +; VBITS_EQ_128-NEXT: .cfi_offset w29, -96 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -104 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -112 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -120 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -128 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -136 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -144 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -152 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -160 +; VBITS_EQ_128-NEXT: ldp q3, q2, [x0] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: ldp q5, q4, [x0, #64] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x8, d3 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x7, d5 +; VBITS_EQ_128-NEXT: str x8, [sp] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q6, q3, [x0, #96] +; VBITS_EQ_128-NEXT: mov x20, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d4 +; VBITS_EQ_128-NEXT: mov x23, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x24, d6 +; VBITS_EQ_128-NEXT: ldp q16, q4, [x0, #128] +; VBITS_EQ_128-NEXT: mov x26, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: mov x28, v16.d[1] +; VBITS_EQ_128-NEXT: fmov x25, d16 +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #224] +; VBITS_EQ_128-NEXT: mov x22, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x19, d4 +; VBITS_EQ_128-NEXT: mov x13, v7.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d7 +; VBITS_EQ_128-NEXT: ldp q17, q6, [x0, #192] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d5 +; VBITS_EQ_128-NEXT: mov x17, v17.d[1] +; VBITS_EQ_128-NEXT: fmov x16, d17 +; VBITS_EQ_128-NEXT: ldp q18, q3, [x0, #160] +; VBITS_EQ_128-NEXT: mov x15, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x14, d6 +; VBITS_EQ_128-NEXT: mov x5, v18.d[1] +; VBITS_EQ_128-NEXT: fmov x4, d18 +; VBITS_EQ_128-NEXT: ldp q19, q16, [x1, #224] +; VBITS_EQ_128-NEXT: mov x29, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x18, d3 +; VBITS_EQ_128-NEXT: fmov 
x8, d19 +; VBITS_EQ_128-NEXT: mov x9, v19.d[1] +; VBITS_EQ_128-NEXT: ldp q21, q20, [x1, #192] +; VBITS_EQ_128-NEXT: mov x30, v16.d[1] +; VBITS_EQ_128-NEXT: umulh x8, x11, x8 +; VBITS_EQ_128-NEXT: umulh x11, x13, x9 +; VBITS_EQ_128-NEXT: fmov x9, d21 +; VBITS_EQ_128-NEXT: str x8, [sp, #48] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q22, q18, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q24, q23, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q25, q17, [x1, #96] +; VBITS_EQ_128-NEXT: ldp q26, q6, [x1, #64] +; VBITS_EQ_128-NEXT: ldp q4, q3, [x1, #32] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d16 +; VBITS_EQ_128-NEXT: umulh x10, x10, x1 +; VBITS_EQ_128-NEXT: mov x1, v20.d[1] +; VBITS_EQ_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_EQ_128-NEXT: str x10, [sp, #56] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: umulh x10, x12, x30 +; VBITS_EQ_128-NEXT: mov x30, v21.d[1] +; VBITS_EQ_128-NEXT: fmov x3, d1 +; VBITS_EQ_128-NEXT: str x10, [sp, #24] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x10, d20 +; VBITS_EQ_128-NEXT: ldr x13, [sp, #16] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d11, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umulh x8, x14, x10 +; VBITS_EQ_128-NEXT: umulh x10, x15, x1 +; VBITS_EQ_128-NEXT: fmov x15, d18 +; VBITS_EQ_128-NEXT: umulh x14, x16, x9 +; VBITS_EQ_128-NEXT: mov x9, v22.d[1] +; VBITS_EQ_128-NEXT: umulh x16, x17, x30 +; VBITS_EQ_128-NEXT: stp x11, x8, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x17, d22 +; VBITS_EQ_128-NEXT: mov x8, v18.d[1] +; VBITS_EQ_128-NEXT: umulh x18, x18, x15 +; VBITS_EQ_128-NEXT: mov x15, v23.d[1] +; VBITS_EQ_128-NEXT: str x10, [sp, #8] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: umulh x4, x4, x17 +; VBITS_EQ_128-NEXT: fmov d8, x16 +; VBITS_EQ_128-NEXT: mov x17, v24.d[1] +; VBITS_EQ_128-NEXT: umulh x5, x5, x9 +; VBITS_EQ_128-NEXT: umulh x1, x29, x8 +; VBITS_EQ_128-NEXT: fmov x8, d23 +; VBITS_EQ_128-NEXT: fmov x9, d24 +; VBITS_EQ_128-NEXT: umulh x22, x22, x15 +; VBITS_EQ_128-NEXT: fmov x15, d17 +; VBITS_EQ_128-NEXT: fmov d9, x14 +; VBITS_EQ_128-NEXT: umulh x19, x19, x8 +; VBITS_EQ_128-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x8, v17.d[1] +; VBITS_EQ_128-NEXT: umulh x25, x25, x9 +; VBITS_EQ_128-NEXT: mov x9, v25.d[1] +; VBITS_EQ_128-NEXT: umulh x28, x28, x17 +; VBITS_EQ_128-NEXT: fmov x17, d25 +; VBITS_EQ_128-NEXT: umulh x15, x27, x15 +; VBITS_EQ_128-NEXT: mov x27, v6.d[1] +; VBITS_EQ_128-NEXT: ldr d15, [sp, #40] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: umulh x12, x26, x8 +; VBITS_EQ_128-NEXT: fmov x26, d6 +; VBITS_EQ_128-NEXT: umulh x17, x24, x17 +; VBITS_EQ_128-NEXT: ldr x8, [sp] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x24, v26.d[1] +; VBITS_EQ_128-NEXT: umulh x11, x23, x9 +; VBITS_EQ_128-NEXT: fmov x23, d26 +; VBITS_EQ_128-NEXT: umulh x21, x21, x26 +; VBITS_EQ_128-NEXT: fmov x26, d0 +; VBITS_EQ_128-NEXT: umulh x20, x20, x27 +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: fmov d20, x17 +; VBITS_EQ_128-NEXT: umulh x7, x7, x23 +; VBITS_EQ_128-NEXT: fmov x23, d4 +; VBITS_EQ_128-NEXT: umulh x6, x6, x24 +; VBITS_EQ_128-NEXT: fmov x24, d5 +; VBITS_EQ_128-NEXT: umulh x26, x26, x27 +; VBITS_EQ_128-NEXT: fmov x27, d7 +; VBITS_EQ_128-NEXT: umulh x3, x3, x23 +; VBITS_EQ_128-NEXT: fmov d19, x20 +; VBITS_EQ_128-NEXT: mov x23, v2.d[1] +; VBITS_EQ_128-NEXT: umulh x2, x2, x24 +; VBITS_EQ_128-NEXT: mov x24, v1.d[1] +; VBITS_EQ_128-NEXT: umulh x27, x8, x27 +; VBITS_EQ_128-NEXT: mov x29, v0.d[1] +; VBITS_EQ_128-NEXT: mov x30, v7.d[1] +; 
VBITS_EQ_128-NEXT: mov x8, v5.d[1] +; VBITS_EQ_128-NEXT: mov x9, v4.d[1] +; VBITS_EQ_128-NEXT: mov x10, v3.d[1] +; VBITS_EQ_128-NEXT: ldp d10, d12, [sp, #24] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umulh x30, x13, x30 +; VBITS_EQ_128-NEXT: fmov d0, x27 +; VBITS_EQ_128-NEXT: umulh x8, x23, x8 +; VBITS_EQ_128-NEXT: fmov d2, x2 +; VBITS_EQ_128-NEXT: umulh x9, x24, x9 +; VBITS_EQ_128-NEXT: fmov d4, x3 +; VBITS_EQ_128-NEXT: umulh x10, x29, x10 +; VBITS_EQ_128-NEXT: fmov d6, x26 +; VBITS_EQ_128-NEXT: mov v11.d[1], v10.d[0] +; VBITS_EQ_128-NEXT: fmov d1, x30 +; VBITS_EQ_128-NEXT: mov v13.d[1], v12.d[0] +; VBITS_EQ_128-NEXT: mov v15.d[1], v14.d[0] +; VBITS_EQ_128-NEXT: mov v9.d[1], v8.d[0] +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: fmov d5, x9 +; VBITS_EQ_128-NEXT: fmov d7, x10 +; VBITS_EQ_128-NEXT: fmov d17, x6 +; VBITS_EQ_128-NEXT: fmov d16, x7 +; VBITS_EQ_128-NEXT: fmov d18, x21 +; VBITS_EQ_128-NEXT: fmov d21, x11 +; VBITS_EQ_128-NEXT: fmov d22, x12 +; VBITS_EQ_128-NEXT: fmov d23, x15 +; VBITS_EQ_128-NEXT: fmov d24, x28 +; VBITS_EQ_128-NEXT: fmov d25, x25 +; VBITS_EQ_128-NEXT: fmov d26, x22 +; VBITS_EQ_128-NEXT: fmov d27, x19 +; VBITS_EQ_128-NEXT: fmov d28, x5 +; VBITS_EQ_128-NEXT: fmov d29, x4 +; VBITS_EQ_128-NEXT: fmov d30, x1 +; VBITS_EQ_128-NEXT: fmov d31, x18 +; VBITS_EQ_128-NEXT: mov v27.d[1], v26.d[0] +; VBITS_EQ_128-NEXT: stp q9, q15, [x0, #192] +; VBITS_EQ_128-NEXT: stp q13, q11, [x0, #224] +; VBITS_EQ_128-NEXT: mov v31.d[1], v30.d[0] +; VBITS_EQ_128-NEXT: mov v29.d[1], v28.d[0] +; VBITS_EQ_128-NEXT: mov v25.d[1], v24.d[0] +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v20.d[1], v21.d[0] +; VBITS_EQ_128-NEXT: mov v18.d[1], v19.d[0] +; VBITS_EQ_128-NEXT: stp q29, q31, [x0, #160] +; VBITS_EQ_128-NEXT: mov v16.d[1], v17.d[0] +; VBITS_EQ_128-NEXT: stp q25, q27, [x0, #128] +; VBITS_EQ_128-NEXT: mov v6.d[1], v7.d[0] +; VBITS_EQ_128-NEXT: mov v4.d[1], v5.d[0] +; VBITS_EQ_128-NEXT: stp q20, q23, [x0, #96] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: stp q16, q18, [x0, #64] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #208] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q4, q6, [x0, #32] +; VBITS_EQ_128-NEXT: ldp x22, x21, [sp, #192] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ldp x24, x23, [sp, #176] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x26, x25, [sp, #160] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x28, x27, [sp, #144] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #224 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 
< %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK @@ -26,6 +27,11 @@ ; Don't use SVE for 64-bit vectors. define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v4i16_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ucvtf v0.4h, v0.4h +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i16_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ucvtf v0.4h, v0.4h @@ -36,6 +42,13 @@ ; Don't use SVE for 128-bit vectors. define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v8i16_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ucvtf v0.8h, v0.8h +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v8i16_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -49,6 +62,14 @@ } define void @ucvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i16_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ucvtf v0.8h, v0.8h +; NO_SVE-NEXT: ucvtf v1.8h, v1.8h +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v16i16_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -63,6 +84,18 @@ } define void @ucvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i16_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: ucvtf v0.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: ucvtf v1.8h, v1.8h +; NO_SVE-NEXT: ucvtf v2.8h, v2.8h +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: ucvtf v3.8h, v3.8h +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v32i16_v32f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -89,6 +122,46 @@ } define void @ucvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v64i16_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ucvtf v0.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: ucvtf v1.8h, v1.8h +; NO_SVE-NEXT: ucvtf v2.8h, v2.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: ucvtf v3.8h, v3.8h +; NO_SVE-NEXT: ucvtf v4.8h, v4.8h +; NO_SVE-NEXT: ldp q6, q7, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #96] +; NO_SVE-NEXT: ucvtf v0.8h, v5.8h +; NO_SVE-NEXT: ucvtf v1.8h, v6.8h +; NO_SVE-NEXT: ucvtf v2.8h, v7.8h +; NO_SVE-NEXT: stp q4, q0, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v64i16_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.h +; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.h +; VBITS_EQ_256-NEXT: ucvtf z2.h, p0/m, z2.h +; VBITS_EQ_256-NEXT: ucvtf z3.h, p0/m, z3.h +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v64i16_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -103,6 +176,78 @@ } define void 
@ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v128i16_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: ucvtf v0.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: ucvtf v1.8h, v1.8h +; NO_SVE-NEXT: ucvtf v2.8h, v2.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: ucvtf v3.8h, v3.8h +; NO_SVE-NEXT: ucvtf v4.8h, v4.8h +; NO_SVE-NEXT: ldp q6, q7, [x0, #128] +; NO_SVE-NEXT: ldp q16, q17, [x0, #96] +; NO_SVE-NEXT: ldp q18, q19, [x0, #64] +; NO_SVE-NEXT: ldp q20, q21, [x0, #32] +; NO_SVE-NEXT: ldp q22, q23, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #192] +; NO_SVE-NEXT: ucvtf v2.8h, v7.8h +; NO_SVE-NEXT: stp q0, q1, [x1, #224] +; NO_SVE-NEXT: ucvtf v0.8h, v5.8h +; NO_SVE-NEXT: ucvtf v1.8h, v6.8h +; NO_SVE-NEXT: ucvtf v3.8h, v16.8h +; NO_SVE-NEXT: stp q4, q0, [x1, #160] +; NO_SVE-NEXT: ucvtf v4.8h, v17.8h +; NO_SVE-NEXT: ucvtf v0.8h, v18.8h +; NO_SVE-NEXT: stp q1, q2, [x1, #128] +; NO_SVE-NEXT: ucvtf v1.8h, v19.8h +; NO_SVE-NEXT: ucvtf v2.8h, v20.8h +; NO_SVE-NEXT: stp q3, q4, [x1, #96] +; NO_SVE-NEXT: ucvtf v3.8h, v21.8h +; NO_SVE-NEXT: ucvtf v4.8h, v22.8h +; NO_SVE-NEXT: stp q0, q1, [x1, #64] +; NO_SVE-NEXT: ucvtf v0.8h, v23.8h +; NO_SVE-NEXT: stp q2, q3, [x1, #32] +; NO_SVE-NEXT: stp q4, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v128i16_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #96 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #80 +; VBITS_EQ_256-NEXT: mov x12, #32 +; VBITS_EQ_256-NEXT: mov x13, #112 +; VBITS_EQ_256-NEXT: mov x14, #64 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.h +; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.h +; VBITS_EQ_256-NEXT: ucvtf z3.h, p0/m, z3.h +; VBITS_EQ_256-NEXT: ucvtf z2.h, p0/m, z2.h +; VBITS_EQ_256-NEXT: ucvtf z5.h, p0/m, z5.h +; VBITS_EQ_256-NEXT: ucvtf z4.h, p0/m, z4.h +; VBITS_EQ_256-NEXT: ucvtf z6.h, p0/m, z6.h +; VBITS_EQ_256-NEXT: ucvtf z7.h, p0/m, z7.h +; VBITS_EQ_256-NEXT: st1h { z6.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v128i16_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -122,6 +267,13 @@ ; Don't use SVE for 64-bit vectors. 
define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i16_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: movi d1, #0x00ffff0000ffff +; NO_SVE-NEXT: and v0.8b, v0.8b, v1.8b +; NO_SVE-NEXT: ucvtf v0.2s, v0.2s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i16_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d1, #0x00ffff0000ffff @@ -134,6 +286,12 @@ ; Don't use SVE for 128-bit vectors. define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v4i16_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i16_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 @@ -144,6 +302,16 @@ } define void @ucvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v8i16_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v8i16_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -159,6 +327,21 @@ } define void @ucvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i16_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ushll v2.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v3.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: stp q3, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v16i16_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 @@ -190,6 +373,57 @@ } define void @ucvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i16_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ushll v6.4s, v3.4h, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v3.8h, #0 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ushll v7.4s, v2.4h, #0 +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: ushll2 v2.4s, v2.8h, #0 +; NO_SVE-NEXT: ucvtf v7.4s, v7.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: ucvtf v6.4s, v6.4s +; NO_SVE-NEXT: ushll v4.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll v5.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: stp q6, q3, [x1, #64] +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: ucvtf v2.4s, v5.4s +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ucvtf v3.4s, v4.4s +; NO_SVE-NEXT: stp q2, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i16_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z0.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z1.h +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: 
uunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v32i16_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 @@ -206,6 +440,99 @@ } define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v64i16_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ushll v18.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #64] +; NO_SVE-NEXT: ucvtf v18.4s, v18.4s +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ushll v19.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: ucvtf v19.4s, v19.4s +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ushll v17.4s, v4.4h, #0 +; NO_SVE-NEXT: ushll2 v4.4s, v4.8h, #0 +; NO_SVE-NEXT: ucvtf v17.4s, v17.4s +; NO_SVE-NEXT: ushll v6.4s, v3.4h, #0 +; NO_SVE-NEXT: ucvtf v4.4s, v4.4s +; NO_SVE-NEXT: ushll v16.4s, v5.4h, #0 +; NO_SVE-NEXT: ldp q21, q20, [x0] +; NO_SVE-NEXT: ushll v7.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q18, q1, [x1, #192] +; NO_SVE-NEXT: ushll2 v2.4s, v2.8h, #0 +; NO_SVE-NEXT: stp q17, q4, [x1, #160] +; NO_SVE-NEXT: ushll2 v1.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q19, q0, [x1, #224] +; NO_SVE-NEXT: ucvtf v6.4s, v6.4s +; NO_SVE-NEXT: ucvtf v7.4s, v7.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ucvtf v16.4s, v16.4s +; NO_SVE-NEXT: ushll2 v5.4s, v5.8h, #0 +; NO_SVE-NEXT: ushll v22.4s, v21.4h, #0 +; NO_SVE-NEXT: ushll v0.4s, v20.4h, #0 +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: ushll2 v3.4s, v20.8h, #0 +; NO_SVE-NEXT: stp q6, q1, [x1, #64] +; NO_SVE-NEXT: ushll2 v4.4s, v21.8h, #0 +; NO_SVE-NEXT: ucvtf v5.4s, v5.4s +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ucvtf v2.4s, v4.4s +; NO_SVE-NEXT: ucvtf v1.4s, v22.4s +; NO_SVE-NEXT: stp q16, q5, [x1, #128] +; NO_SVE-NEXT: stp q0, q3, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v64i16_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #32 +; VBITS_EQ_256-NEXT: mov x10, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: uunpklo z4.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z5.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z6.s, z2.h +; VBITS_EQ_256-NEXT: ucvtf z4.s, p0/m, z4.s +; VBITS_EQ_256-NEXT: ucvtf z5.s, p0/m, z5.s +; VBITS_EQ_256-NEXT: ucvtf z6.s, p0/m, z6.s +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z7.s, z3.h +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: mov x10, #40 +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x1, x8, lsl #2] +; 
VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: ucvtf z7.s, p0/m, z7.s +; VBITS_EQ_256-NEXT: ucvtf z3.s, p0/m, z3.s +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v64i16_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64 @@ -227,6 +554,13 @@ ; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v1i16_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: umov w8, v0.h[0] +; NO_SVE-NEXT: ucvtf d0, w8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v1i16_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -242,6 +576,14 @@ ; Don't use SVE for 128-bit vectors. define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i16_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: movi d1, #0x00ffff0000ffff +; NO_SVE-NEXT: and v0.8b, v0.8b, v1.8b +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i16_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d1, #0x00ffff0000ffff @@ -254,6 +596,27 @@ } define void @ucvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v4i16_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d1, [x0] +; NO_SVE-NEXT: movi d0, #0x00ffff0000ffff +; NO_SVE-NEXT: umov w8, v1.h[0] +; NO_SVE-NEXT: umov w9, v1.h[2] +; NO_SVE-NEXT: umov w10, v1.h[1] +; NO_SVE-NEXT: fmov s2, w8 +; NO_SVE-NEXT: umov w8, v1.h[3] +; NO_SVE-NEXT: fmov s1, w9 +; NO_SVE-NEXT: mov v2.s[1], w10 +; NO_SVE-NEXT: mov v1.s[1], w8 +; NO_SVE-NEXT: and v2.8b, v2.8b, v0.8b +; NO_SVE-NEXT: and v0.8b, v1.8b, v0.8b +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i16_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -270,6 +633,43 @@ } define void @ucvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v8i16_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q1, [x0] +; NO_SVE-NEXT: movi d0, #0x00ffff0000ffff +; NO_SVE-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: umov w8, v1.h[0] +; NO_SVE-NEXT: umov w9, v1.h[2] +; NO_SVE-NEXT: umov w11, v1.h[1] +; NO_SVE-NEXT: umov w10, v2.h[0] +; NO_SVE-NEXT: umov w12, v2.h[2] +; NO_SVE-NEXT: fmov s3, w8 +; NO_SVE-NEXT: umov w8, v1.h[3] +; NO_SVE-NEXT: fmov s1, w9 +; NO_SVE-NEXT: umov w9, v2.h[1] +; NO_SVE-NEXT: fmov s4, w10 +; NO_SVE-NEXT: umov w10, v2.h[3] +; NO_SVE-NEXT: fmov s2, w12 +; NO_SVE-NEXT: mov v3.s[1], w11 +; NO_SVE-NEXT: mov v1.s[1], w8 +; NO_SVE-NEXT: mov v4.s[1], w9 +; NO_SVE-NEXT: mov v2.s[1], w10 +; NO_SVE-NEXT: and v3.8b, v3.8b, v0.8b +; NO_SVE-NEXT: and v1.8b, v1.8b, v0.8b +; NO_SVE-NEXT: and v4.8b, v4.8b, v0.8b +; NO_SVE-NEXT: and v0.8b, v2.8b, v0.8b +; NO_SVE-NEXT:
ushll v3.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v4.2s, #0 +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: stp q3, q1, [x1] +; NO_SVE-NEXT: stp q2, q0, [x1, #32] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v8i16_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ldr q0, [x0] @@ -302,6 +702,104 @@ } define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i16_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: movi d0, #0x00ffff0000ffff +; NO_SVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; NO_SVE-NEXT: umov w8, v2.h[0] +; NO_SVE-NEXT: umov w10, v2.h[1] +; NO_SVE-NEXT: umov w9, v3.h[0] +; NO_SVE-NEXT: umov w11, v3.h[2] +; NO_SVE-NEXT: umov w12, v4.h[0] +; NO_SVE-NEXT: fmov s5, w8 +; NO_SVE-NEXT: umov w8, v3.h[1] +; NO_SVE-NEXT: fmov s6, w9 +; NO_SVE-NEXT: umov w9, v3.h[3] +; NO_SVE-NEXT: mov v5.s[1], w10 +; NO_SVE-NEXT: fmov s3, w11 +; NO_SVE-NEXT: umov w10, v4.h[1] +; NO_SVE-NEXT: fmov s7, w12 +; NO_SVE-NEXT: umov w11, v4.h[2] +; NO_SVE-NEXT: mov v3.s[1], w9 +; NO_SVE-NEXT: umov w9, v1.h[2] +; NO_SVE-NEXT: mov v6.s[1], w8 +; NO_SVE-NEXT: mov v7.s[1], w10 +; NO_SVE-NEXT: umov w10, v2.h[2] +; NO_SVE-NEXT: fmov s17, w9 +; NO_SVE-NEXT: umov w9, v2.h[3] +; NO_SVE-NEXT: umov w8, v1.h[0] +; NO_SVE-NEXT: fmov s2, w10 +; NO_SVE-NEXT: umov w10, v1.h[3] +; NO_SVE-NEXT: fmov s16, w8 +; NO_SVE-NEXT: umov w8, v4.h[3] +; NO_SVE-NEXT: mov v2.s[1], w9 +; NO_SVE-NEXT: fmov s4, w11 +; NO_SVE-NEXT: umov w11, v1.h[1] +; NO_SVE-NEXT: and v1.8b, v5.8b, v0.8b +; NO_SVE-NEXT: and v5.8b, v7.8b, v0.8b +; NO_SVE-NEXT: ushll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: and v2.8b, v2.8b, v0.8b +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: mov v4.s[1], w8 +; NO_SVE-NEXT: mov v16.s[1], w11 +; NO_SVE-NEXT: mov v17.s[1], w10 +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: and v4.8b, v4.8b, v0.8b +; NO_SVE-NEXT: and v7.8b, v16.8b, v0.8b +; NO_SVE-NEXT: and v16.8b, v17.8b, v0.8b +; NO_SVE-NEXT: stp q1, q2, [x1, #64] +; NO_SVE-NEXT: and v1.8b, v6.8b, v0.8b +; NO_SVE-NEXT: and v0.8b, v3.8b, v0.8b +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll v4.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll v16.2d, v16.2s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: ucvtf v2.2d, v5.2d +; NO_SVE-NEXT: ucvtf v3.2d, v16.2d +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: ucvtf v5.2d, v7.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: stp q2, q4, [x1, #96] +; NO_SVE-NEXT: stp q5, q3, [x1] +; NO_SVE-NEXT: stp q1, q0, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v16i16_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov z1.d, z0.d +; VBITS_EQ_256-NEXT: uunpklo z2.s, z0.h +; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; 
VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: uunpklo z1.s, z3.h +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: uunpklo z0.d, z2.s +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i16_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -319,6 +817,192 @@ } define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i16_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q6, q4, [x0] +; NO_SVE-NEXT: movi d0, #0x00ffff0000ffff +; NO_SVE-NEXT: ext v1.16b, v6.16b, v6.16b, #8 +; NO_SVE-NEXT: ext v16.16b, v4.16b, v4.16b, #8 +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: umov w8, v1.h[2] +; NO_SVE-NEXT: umov w10, v1.h[3] +; NO_SVE-NEXT: umov w9, v16.h[2] +; NO_SVE-NEXT: umov w11, v16.h[3] +; NO_SVE-NEXT: ext v19.16b, v2.16b, v2.16b, #8 +; NO_SVE-NEXT: fmov s5, w8 +; NO_SVE-NEXT: ext v21.16b, v3.16b, v3.16b, #8 +; NO_SVE-NEXT: umov w8, v16.h[0] +; NO_SVE-NEXT: fmov s7, w9 +; NO_SVE-NEXT: mov v5.s[1], w10 +; NO_SVE-NEXT: umov w10, v19.h[0] +; NO_SVE-NEXT: umov w9, v19.h[2] +; NO_SVE-NEXT: mov v7.s[1], w11 +; NO_SVE-NEXT: umov w11, v16.h[1] +; NO_SVE-NEXT: fmov s16, w8 +; NO_SVE-NEXT: umov w8, v21.h[0] +; NO_SVE-NEXT: umov w12, v21.h[2] +; NO_SVE-NEXT: fmov s18, w10 +; NO_SVE-NEXT: umov w10, v19.h[1] +; NO_SVE-NEXT: fmov s17, w9 +; NO_SVE-NEXT: umov w9, v19.h[3] +; NO_SVE-NEXT: mov v16.s[1], w11 +; NO_SVE-NEXT: umov w11, v6.h[3] +; NO_SVE-NEXT: fmov s20, w8 +; NO_SVE-NEXT: umov w8, v21.h[1] +; NO_SVE-NEXT: fmov s19, w12 +; NO_SVE-NEXT: umov w12, v21.h[3] +; NO_SVE-NEXT: mov v18.s[1], w10 +; NO_SVE-NEXT: umov w10, v4.h[2] +; NO_SVE-NEXT: mov v17.s[1], w9 +; NO_SVE-NEXT: mov v20.s[1], w8 +; NO_SVE-NEXT: umov w8, v6.h[2] +; NO_SVE-NEXT: mov v19.s[1], w12 +; NO_SVE-NEXT: umov w9, v6.h[0] +; NO_SVE-NEXT: umov w12, v4.h[0] +; NO_SVE-NEXT: fmov s22, w10 +; NO_SVE-NEXT: umov w10, v4.h[1] +; NO_SVE-NEXT: fmov s21, w8 +; NO_SVE-NEXT: umov w8, v6.h[1] +; NO_SVE-NEXT: fmov s6, w9 +; NO_SVE-NEXT: umov w9, v4.h[3] +; NO_SVE-NEXT: fmov s4, w12 +; NO_SVE-NEXT: and v19.8b, v19.8b, v0.8b +; NO_SVE-NEXT: mov v21.s[1], w11 +; NO_SVE-NEXT: umov w11, v3.h[0] +; NO_SVE-NEXT: mov v6.s[1], w8 +; NO_SVE-NEXT: mov v4.s[1], w10 +; NO_SVE-NEXT: umov w8, v2.h[2] +; NO_SVE-NEXT: umov w10, v2.h[0] +; NO_SVE-NEXT: and v20.8b, v20.8b, v0.8b +; NO_SVE-NEXT: mov v22.s[1], w9 +; NO_SVE-NEXT: umov w9, v3.h[2] +; NO_SVE-NEXT: and v4.8b, v4.8b, v0.8b +; NO_SVE-NEXT: fmov s23, w8 +; NO_SVE-NEXT: umov w8, v3.h[3] +; NO_SVE-NEXT: fmov s25, w10 +; NO_SVE-NEXT: umov w10, v3.h[1] +; NO_SVE-NEXT: fmov s24, w9 +; NO_SVE-NEXT: fmov s3, w11 +; NO_SVE-NEXT: umov w9, v2.h[3] +; NO_SVE-NEXT: umov w11, v2.h[1] +; NO_SVE-NEXT: and v22.8b, v22.8b, v0.8b +; NO_SVE-NEXT: and v21.8b, v21.8b, v0.8b +; NO_SVE-NEXT: mov v24.s[1], w8 +; NO_SVE-NEXT: umov w8, v1.h[0] +; NO_SVE-NEXT: mov v3.s[1], w10 +; NO_SVE-NEXT: mov v23.s[1], w9 +; NO_SVE-NEXT: mov v25.s[1], w11 +; NO_SVE-NEXT: and v2.8b, v24.8b, v0.8b +; NO_SVE-NEXT: and v3.8b, v3.8b, v0.8b +; NO_SVE-NEXT: and v6.8b, v6.8b, v0.8b +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; 
NO_SVE-NEXT: and v23.8b, v23.8b, v0.8b +; NO_SVE-NEXT: ushll v3.2d, v3.2s, #0 +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: and v24.8b, v25.8b, v0.8b +; NO_SVE-NEXT: ushll v4.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v24.2d, v24.2s, #0 +; NO_SVE-NEXT: ushll v23.2d, v23.2s, #0 +; NO_SVE-NEXT: stp q3, q2, [x1, #192] +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: ushll v2.2d, v22.2s, #0 +; NO_SVE-NEXT: ucvtf v24.2d, v24.2d +; NO_SVE-NEXT: ucvtf v3.2d, v23.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: stp q24, q3, [x1, #128] +; NO_SVE-NEXT: stp q4, q2, [x1, #64] +; NO_SVE-NEXT: fmov s4, w8 +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: and v2.8b, v7.8b, v0.8b +; NO_SVE-NEXT: ushll v3.2d, v19.2s, #0 +; NO_SVE-NEXT: and v7.8b, v16.8b, v0.8b +; NO_SVE-NEXT: ushll v19.2d, v20.2s, #0 +; NO_SVE-NEXT: ushll v20.2d, v21.2s, #0 +; NO_SVE-NEXT: and v16.8b, v17.8b, v0.8b +; NO_SVE-NEXT: and v17.8b, v18.8b, v0.8b +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ucvtf v18.2d, v19.2d +; NO_SVE-NEXT: ucvtf v1.2d, v6.2d +; NO_SVE-NEXT: mov v4.s[1], w8 +; NO_SVE-NEXT: ucvtf v6.2d, v20.2d +; NO_SVE-NEXT: stp q18, q3, [x1, #224] +; NO_SVE-NEXT: and v3.8b, v5.8b, v0.8b +; NO_SVE-NEXT: and v0.8b, v4.8b, v0.8b +; NO_SVE-NEXT: stp q1, q6, [x1] +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll v6.2d, v16.2s, #0 +; NO_SVE-NEXT: ushll v7.2d, v17.2s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v3.2d, v3.2s, #0 +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v4.2d, v7.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ucvtf v5.2d, v6.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: stp q2, q1, [x1, #96] +; NO_SVE-NEXT: stp q4, q5, [x1, #160] +; NO_SVE-NEXT: stp q0, q3, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i16_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z3.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: uunpklo z4.s, z1.h +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: uunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: uunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: uunpklo z6.s, z6.h +; VBITS_EQ_256-NEXT: 
movprfx z0, z5 +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z5.d +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #28 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: uunpklo z1.d, z6.s +; VBITS_EQ_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: uunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: movprfx z0, z4 +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z4.d +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i16_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -341,6 +1025,13 @@ ; Don't use SVE for 64-bit vectors. define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i32_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i32_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -353,6 +1044,12 @@ ; Don't use SVE for 128-bit vectors. define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v4i32_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i32_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ucvtf v0.4s, v0.4s @@ -363,6 +1060,16 @@ } define <8 x half> @ucvtf_v8i32_v8f16(<8 x i32>* %a) #0 { +; NO_SVE-LABEL: ucvtf_v8i32_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v8i32_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -378,6 +1085,23 @@ } define void @ucvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i32_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v16i32_v16f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -412,6 +1136,63 @@ } define void @ucvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i32_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #96] +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: ucvtf v4.4s, v4.4s +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: ucvtf v5.4s, v5.4s +; NO_SVE-NEXT: fcvtn v4.4h, v4.4s +; NO_SVE-NEXT: mov 
v2.d[1], v3.d[0] +; NO_SVE-NEXT: ucvtf v6.4s, v6.4s +; NO_SVE-NEXT: fcvtn v5.4h, v5.4s +; NO_SVE-NEXT: ucvtf v7.4s, v7.4s +; NO_SVE-NEXT: stp q0, q2, [x1, #32] +; NO_SVE-NEXT: fcvtn v6.4h, v6.4s +; NO_SVE-NEXT: mov v4.d[1], v5.d[0] +; NO_SVE-NEXT: fcvtn v7.4h, v7.4s +; NO_SVE-NEXT: mov v6.d[1], v7.d[0] +; NO_SVE-NEXT: stp q4, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i32_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ptrue p1.s +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 +; VBITS_EQ_256-NEXT: ucvtf z0.h, p1/m, z0.s +; VBITS_EQ_256-NEXT: ucvtf z2.h, p1/m, z2.s +; VBITS_EQ_256-NEXT: ucvtf z1.h, p1/m, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: ucvtf z3.h, p1/m, z3.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: splice z0.h, p0, z0.h, z1.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -429,6 +1210,111 @@ } define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v64i32_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #192] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: ucvtf v4.4s, v4.4s +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: ucvtf v5.4s, v5.4s +; NO_SVE-NEXT: fcvtn v4.4h, v4.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: ucvtf v6.4s, v6.4s +; NO_SVE-NEXT: fcvtn v5.4h, v5.4s +; NO_SVE-NEXT: ldp q16, q17, [x0, #64] +; NO_SVE-NEXT: ucvtf v7.4s, v7.4s +; NO_SVE-NEXT: fcvtn v6.4h, v6.4s +; NO_SVE-NEXT: mov v4.d[1], v5.d[0] +; NO_SVE-NEXT: ucvtf v16.4s, v16.4s +; NO_SVE-NEXT: fcvtn v7.4h, v7.4s +; NO_SVE-NEXT: ldp q18, q19, [x0, #224] +; NO_SVE-NEXT: ucvtf v17.4s, v17.4s +; NO_SVE-NEXT: fcvtn v16.4h, v16.4s +; NO_SVE-NEXT: mov v6.d[1], v7.d[0] +; NO_SVE-NEXT: ucvtf v18.4s, v18.4s +; NO_SVE-NEXT: fcvtn v17.4h, v17.4s +; NO_SVE-NEXT: ldp q20, q21, [x0, #128] +; NO_SVE-NEXT: ucvtf v19.4s, v19.4s +; NO_SVE-NEXT: fcvtn v18.4h, v18.4s +; NO_SVE-NEXT: mov v16.d[1], v17.d[0] +; NO_SVE-NEXT: ucvtf v20.4s, v20.4s +; NO_SVE-NEXT: fcvtn v19.4h, v19.4s +; NO_SVE-NEXT: ldp q22, q23, [x0, #160] +; NO_SVE-NEXT: ucvtf v21.4s, v21.4s +; NO_SVE-NEXT: stp q4, q2, [x1] +; NO_SVE-NEXT: fcvtn v20.4h, v20.4s +; NO_SVE-NEXT: stp q16, q6, [x1, #32] +; NO_SVE-NEXT: mov v18.d[1], v19.d[0] +; NO_SVE-NEXT: ucvtf v22.4s, v22.4s +; NO_SVE-NEXT: fcvtn v21.4h, v21.4s +; NO_SVE-NEXT: ucvtf v23.4s, v23.4s +; NO_SVE-NEXT: stp q0, q18, [x1, #96] +; NO_SVE-NEXT: fcvtn v22.4h, v22.4s +; NO_SVE-NEXT: mov 
v20.d[1], v21.d[0] +; NO_SVE-NEXT: fcvtn v23.4h, v23.4s +; NO_SVE-NEXT: mov v22.d[1], v23.d[0] +; NO_SVE-NEXT: stp q20, q22, [x1, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v64i32_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x12, #48 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #40 +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: ptrue p1.s +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ucvtf z1.h, p1/m, z1.s +; VBITS_EQ_256-NEXT: ucvtf z2.h, p1/m, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: ptrue p2.h, vl8 +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: splice z2.h, p2, z2.h, z1.h +; VBITS_EQ_256-NEXT: movprfx z1, z6 +; VBITS_EQ_256-NEXT: ucvtf z1.h, p1/m, z6.s +; VBITS_EQ_256-NEXT: ucvtf z5.h, p1/m, z5.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_EQ_256-NEXT: ucvtf z3.h, p1/m, z3.s +; VBITS_EQ_256-NEXT: ucvtf z4.h, p1/m, z4.s +; VBITS_EQ_256-NEXT: splice z5.h, p2, z5.h, z1.h +; VBITS_EQ_256-NEXT: ucvtf z0.h, p1/m, z0.s +; VBITS_EQ_256-NEXT: movprfx z1, z7 +; VBITS_EQ_256-NEXT: ucvtf z1.h, p1/m, z7.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: splice z4.h, p2, z4.h, z3.h +; VBITS_EQ_256-NEXT: splice z1.h, p2, z1.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -451,6 +1337,11 @@ ; Don't use SVE for 64-bit vectors. define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i32_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ucvtf v0.2s, v0.2s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i32_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ucvtf v0.2s, v0.2s @@ -461,6 +1352,11 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v4i32_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i32_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ucvtf v0.4s, v0.4s @@ -470,6 +1366,14 @@ } define void @ucvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v8i32_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v8i32_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -484,6 +1388,18 @@ } define void @ucvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i32_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v16i32_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -510,6 +1426,46 @@ } define void @ucvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i32_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: ucvtf v4.4s, v4.4s +; NO_SVE-NEXT: ldp q6, q7, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #96] +; NO_SVE-NEXT: ucvtf v0.4s, v5.4s +; NO_SVE-NEXT: ucvtf v1.4s, v6.4s +; NO_SVE-NEXT: ucvtf v2.4s, v7.4s +; NO_SVE-NEXT: stp q4, q0, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i32_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: ucvtf z3.s, p0/m, z3.s +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -524,6 +1480,78 @@ } define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v64i32_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: ucvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: ucvtf v1.4s, v1.4s +; NO_SVE-NEXT: ucvtf v2.4s, v2.4s +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: ucvtf v3.4s, v3.4s +; NO_SVE-NEXT: ucvtf v4.4s, v4.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #128] +; NO_SVE-NEXT: ldp q16, q17, [x0, #96] +; NO_SVE-NEXT: ldp q18, q19, [x0, #64] +; NO_SVE-NEXT: ldp q20, q21, [x0, #32] +; 
NO_SVE-NEXT: ldp q22, q23, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #192] +; NO_SVE-NEXT: ucvtf v2.4s, v7.4s +; NO_SVE-NEXT: stp q0, q1, [x1, #224] +; NO_SVE-NEXT: ucvtf v0.4s, v5.4s +; NO_SVE-NEXT: ucvtf v1.4s, v6.4s +; NO_SVE-NEXT: ucvtf v3.4s, v16.4s +; NO_SVE-NEXT: stp q4, q0, [x1, #160] +; NO_SVE-NEXT: ucvtf v4.4s, v17.4s +; NO_SVE-NEXT: ucvtf v0.4s, v18.4s +; NO_SVE-NEXT: stp q1, q2, [x1, #128] +; NO_SVE-NEXT: ucvtf v1.4s, v19.4s +; NO_SVE-NEXT: ucvtf v2.4s, v20.4s +; NO_SVE-NEXT: stp q3, q4, [x1, #96] +; NO_SVE-NEXT: ucvtf v3.4s, v21.4s +; NO_SVE-NEXT: ucvtf v4.4s, v22.4s +; NO_SVE-NEXT: stp q0, q1, [x1, #64] +; NO_SVE-NEXT: ucvtf v0.4s, v23.4s +; NO_SVE-NEXT: stp q2, q3, [x1, #32] +; NO_SVE-NEXT: stp q4, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v64i32_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, #56 +; VBITS_EQ_256-NEXT: mov x14, #32 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: ucvtf z3.s, p0/m, z3.s +; VBITS_EQ_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: ucvtf z5.s, p0/m, z5.s +; VBITS_EQ_256-NEXT: ucvtf z4.s, p0/m, z4.s +; VBITS_EQ_256-NEXT: ucvtf z6.s, p0/m, z6.s +; VBITS_EQ_256-NEXT: ucvtf z7.s, p0/m, z7.s +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x1, x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x1, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -543,6 +1571,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v1i32_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v1i32_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 @@ -555,6 +1590,12 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i32_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i32_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 @@ -565,6 +1606,16 @@ } define void @ucvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v4i32_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i32_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -580,6 +1631,21 @@ } define void @ucvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v8i32_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ushll v2.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v3.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v1.4s, #0 +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: stp q3, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v8i32_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 @@ -611,6 +1677,57 @@ } define void @ucvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i32_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ushll v6.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v3.4s, #0 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ushll v7.2d, v2.2s, #0 +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ushll2 v2.2d, v2.4s, #0 +; NO_SVE-NEXT: ucvtf v7.2d, v7.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ucvtf v6.2d, v6.2d +; NO_SVE-NEXT: ushll v4.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll v5.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v1.4s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v0.4s, #0 +; NO_SVE-NEXT: stp q6, q3, [x1, #64] +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: ucvtf v2.2d, v5.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v3.2d, v4.2d +; NO_SVE-NEXT: stp q2, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v16i32_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: uunpklo z2.d, z0.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: uunpklo z2.d, z1.s +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i32_v16f64: ; VBITS_GE_1024: // %bb.0: 
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -627,6 +1744,99 @@ } define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i32_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ushll v18.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v1.4s, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #64] +; NO_SVE-NEXT: ucvtf v18.2d, v18.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ushll v19.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v0.4s, #0 +; NO_SVE-NEXT: ucvtf v19.2d, v19.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ushll v17.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll2 v4.2d, v4.4s, #0 +; NO_SVE-NEXT: ucvtf v17.2d, v17.2d +; NO_SVE-NEXT: ushll v6.2d, v3.2s, #0 +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: ushll v16.2d, v5.2s, #0 +; NO_SVE-NEXT: ldp q21, q20, [x0] +; NO_SVE-NEXT: ushll v7.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q18, q1, [x1, #192] +; NO_SVE-NEXT: ushll2 v2.2d, v2.4s, #0 +; NO_SVE-NEXT: stp q17, q4, [x1, #160] +; NO_SVE-NEXT: ushll2 v1.2d, v3.4s, #0 +; NO_SVE-NEXT: stp q19, q0, [x1, #224] +; NO_SVE-NEXT: ucvtf v6.2d, v6.2d +; NO_SVE-NEXT: ucvtf v7.2d, v7.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v16.2d, v16.2d +; NO_SVE-NEXT: ushll2 v5.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v22.2d, v21.2s, #0 +; NO_SVE-NEXT: ushll v0.2d, v20.2s, #0 +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: ushll2 v3.2d, v20.4s, #0 +; NO_SVE-NEXT: stp q6, q1, [x1, #64] +; NO_SVE-NEXT: ushll2 v4.2d, v21.4s, #0 +; NO_SVE-NEXT: ucvtf v5.2d, v5.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v2.2d, v4.2d +; NO_SVE-NEXT: ucvtf v1.2d, v22.2d +; NO_SVE-NEXT: stp q16, q5, [x1, #128] +; NO_SVE-NEXT: stp q0, q3, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i32_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x11, #12 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: uunpklo z4.d, z0.s +; VBITS_EQ_256-NEXT: uunpklo z5.d, z1.s +; VBITS_EQ_256-NEXT: uunpklo z6.d, z2.s +; VBITS_EQ_256-NEXT: ucvtf z4.d, p0/m, z4.d +; VBITS_EQ_256-NEXT: ucvtf z5.d, p0/m, z5.d +; VBITS_EQ_256-NEXT: ucvtf z6.d, p0/m, z6.d +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: uunpklo z7.d, z3.s +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #20 +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: ucvtf z7.d, p0/m, z7.d +; VBITS_EQ_256-NEXT: ucvtf z3.d, p0/m, z3.d 
+; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i32_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -649,6 +1859,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v1i64_v1f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: ucvtf h0, x8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v1i64_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -661,6 +1878,16 @@ ; v2f16 is not legal for NEON, so use SVE define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i64_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: mov x8, v0.d[1] +; NO_SVE-NEXT: fmov x9, d0 +; NO_SVE-NEXT: ucvtf h0, x9 +; NO_SVE-NEXT: ucvtf h1, x8 +; NO_SVE-NEXT: mov v0.h[1], v1.h[0] +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i64_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -675,6 +1902,16 @@ } define <4 x half> @ucvtf_v4i64_v4f16(<4 x i64>* %a) #0 { +; NO_SVE-LABEL: ucvtf_v4i64_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v1.2d +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i64_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -691,6 +1928,23 @@ } define <8 x half> @ucvtf_v8i64_v8f16(<8 x i64>* %a) #0 { +; NO_SVE-LABEL: ucvtf_v8i64_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q1, q3, [x0] +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v2.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: fcvtn v2.4h, v0.4s +; NO_SVE-NEXT: fcvtn2 v1.4s, v3.2d +; NO_SVE-NEXT: fcvtn v0.4h, v1.4s +; NO_SVE-NEXT: mov v0.d[1], v2.d[0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v8i64_v8f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -724,6 +1978,68 @@ } define void @ucvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i64_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #64] +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q3, q2, [x0, #96] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ldp q5, q4, [x0] +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: ucvtf v5.2d, v5.2d +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: fcvtn2 v3.4s, v2.2d +; NO_SVE-NEXT: fcvtn v5.2s, v5.2d +; NO_SVE-NEXT: ucvtf v6.2d, v6.2d +; NO_SVE-NEXT: fcvtn v0.4h, v3.4s +; NO_SVE-NEXT: ucvtf v7.2d, v7.2d +; NO_SVE-NEXT: fcvtn2 v5.4s, v4.2d +; NO_SVE-NEXT: fcvtn v6.2s, v6.2d +; NO_SVE-NEXT: mov v1.d[1], v0.d[0] +; NO_SVE-NEXT: fcvtn v3.4h, v5.4s +; NO_SVE-NEXT: fcvtn2 v6.4s, v7.2d +; NO_SVE-NEXT: fcvtn v2.4h, 
v6.4s +; NO_SVE-NEXT: mov v3.d[1], v2.d[0] +; NO_SVE-NEXT: stp q3, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v16i64_v16f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d +; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.d +; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.d +; VBITS_EQ_256-NEXT: ucvtf z2.h, p0/m, z2.d +; VBITS_EQ_256-NEXT: ucvtf z3.h, p0/m, z3.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: uzp1 z1.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 +; VBITS_EQ_256-NEXT: mov v2.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -742,6 +2058,122 @@ } define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i64_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q2, q5, [x0, #96] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q6, q4, [x0, #160] +; NO_SVE-NEXT: ucvtf v5.2d, v5.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v2.2s, v2.2d +; NO_SVE-NEXT: ucvtf v6.2d, v6.2d +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q16, q7, [x0, #224] +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: fcvtn2 v2.4s, v5.2d +; NO_SVE-NEXT: fcvtn v6.2s, v6.2d +; NO_SVE-NEXT: ucvtf v16.2d, v16.2d +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: ldp q18, q17, [x0, #32] +; NO_SVE-NEXT: ucvtf v7.2d, v7.2d +; NO_SVE-NEXT: fcvtn2 v6.4s, v4.2d +; NO_SVE-NEXT: fcvtn v16.2s, v16.2d +; NO_SVE-NEXT: ucvtf v18.2d, v18.2d +; NO_SVE-NEXT: fcvtn v0.4h, v6.4s +; NO_SVE-NEXT: ldp q19, q3, [x0, #64] +; NO_SVE-NEXT: ucvtf v17.2d, v17.2d +; NO_SVE-NEXT: fcvtn2 v16.4s, v7.2d +; NO_SVE-NEXT: fcvtn v18.2s, v18.2d +; NO_SVE-NEXT: mov v1.d[1], v0.d[0] +; NO_SVE-NEXT: ucvtf v19.2d, v19.2d +; NO_SVE-NEXT: ldp q21, q20, [x0, #192] +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: fcvtn2 v18.4s, v17.2d +; NO_SVE-NEXT: fcvtn v19.2s, v19.2d +; NO_SVE-NEXT: ucvtf v21.2d, v21.2d +; NO_SVE-NEXT: fcvtn v5.4h, v18.4s +; NO_SVE-NEXT: ldp q23, q22, [x0] +; NO_SVE-NEXT: ucvtf v20.2d, v20.2d +; NO_SVE-NEXT: fcvtn2 v19.4s, v3.2d +; NO_SVE-NEXT: fcvtn v21.2s, v21.2d +; NO_SVE-NEXT: fcvtn v3.4h, v16.4s +; NO_SVE-NEXT: ucvtf v23.2d, v23.2d +; NO_SVE-NEXT: fcvtn v7.4h, v19.4s +; NO_SVE-NEXT: ucvtf v22.2d, v22.2d +; NO_SVE-NEXT: fcvtn2 v21.4s, v20.2d +; NO_SVE-NEXT: fcvtn v23.2s, v23.2d +; NO_SVE-NEXT: mov v7.d[1], v2.d[0] +; NO_SVE-NEXT: fcvtn v4.4h, v21.4s +; NO_SVE-NEXT: fcvtn2 v23.4s, v22.2d +; NO_SVE-NEXT: mov v4.d[1], v3.d[0] +; NO_SVE-NEXT: fcvtn v6.4h, v23.4s +; NO_SVE-NEXT: stp q1, q4, 
[x1, #32] +; NO_SVE-NEXT: mov v6.d[1], v5.d[0] +; NO_SVE-NEXT: stp q6, q7, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i64_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: mov x11, #28 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x13, #20 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d +; VBITS_EQ_256-NEXT: ucvtf z2.h, p0/m, z2.d +; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.d +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v1.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: movprfx z2, z6 +; VBITS_EQ_256-NEXT: ucvtf z2.h, p0/m, z6.d +; VBITS_EQ_256-NEXT: ucvtf z5.h, p0/m, z5.d +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_EQ_256-NEXT: ucvtf z3.h, p0/m, z3.d +; VBITS_EQ_256-NEXT: mov v5.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: movprfx z2, z4 +; VBITS_EQ_256-NEXT: ucvtf z2.h, p0/m, z4.d +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.d +; VBITS_EQ_256-NEXT: movprfx z2, z7 +; VBITS_EQ_256-NEXT: ucvtf z2.h, p0/m, z7.d +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 +; VBITS_EQ_256-NEXT: mov v2.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: splice z5.h, p0, z5.h, z1.h +; VBITS_EQ_256-NEXT: splice z2.h, p0, z2.h, z3.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -765,6 +2197,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v1i64_v1f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v1i64_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -777,6 +2216,12 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i64_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i64_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ucvtf v0.2d, v0.2d @@ -787,6 +2232,15 @@ } define <4 x float> @ucvtf_v4i64_v4f32(<4 x i64>* %a) #0 { +; NO_SVE-LABEL: ucvtf_v4i64_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v1.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i64_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -802,6 +2256,21 @@ } define void @ucvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v8i64_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q1, q3, [x0] +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v2.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v3.2d +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v8i64_v8f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -836,6 +2305,59 @@ } define void @ucvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i64_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #64] +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q2, q5, [x0, #96] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q3, q6, [x0] +; NO_SVE-NEXT: ucvtf v5.2d, v5.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v2.2s, v2.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ldp q4, q7, [x0, #32] +; NO_SVE-NEXT: ucvtf v6.2d, v6.2d +; NO_SVE-NEXT: fcvtn2 v2.4s, v5.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: stp q1, q2, [x1, #32] +; NO_SVE-NEXT: ucvtf v7.2d, v7.2d +; NO_SVE-NEXT: fcvtn2 v3.4s, v6.2d +; NO_SVE-NEXT: fcvtn v4.2s, v4.2d +; NO_SVE-NEXT: fcvtn2 v4.4s, v7.2d +; NO_SVE-NEXT: stp q3, q4, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v16i64_v16f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ptrue p1.d +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_256-NEXT: ucvtf z0.s, p1/m, z0.d +; VBITS_EQ_256-NEXT: ucvtf z2.s, p1/m, z2.d +; VBITS_EQ_256-NEXT: ucvtf z1.s, p1/m, z1.d +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: ucvtf z3.s, p1/m, z3.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: splice z2.s, p0, z2.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z0.s, p0, z0.s, z1.s +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f32: ; VBITS_GE_1024: // %bb.0: ; 
VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -853,6 +2375,103 @@ } define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v32i64_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #192] +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q5, q4, [x0, #224] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: ucvtf v5.2d, v5.2d +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v5.2s, v5.2d +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ldp q7, q6, [x0, #96] +; NO_SVE-NEXT: fcvtn2 v5.4s, v4.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: ucvtf v7.2d, v7.2d +; NO_SVE-NEXT: ldp q17, q16, [x0] +; NO_SVE-NEXT: ucvtf v0.2d, v6.2d +; NO_SVE-NEXT: fcvtn2 v3.4s, v2.2d +; NO_SVE-NEXT: fcvtn v7.2s, v7.2d +; NO_SVE-NEXT: ucvtf v17.2d, v17.2d +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: ucvtf v4.2d, v16.2d +; NO_SVE-NEXT: fcvtn2 v7.4s, v0.2d +; NO_SVE-NEXT: fcvtn v17.2s, v17.2d +; NO_SVE-NEXT: ucvtf v19.2d, v19.2d +; NO_SVE-NEXT: ldp q21, q20, [x0, #128] +; NO_SVE-NEXT: ucvtf v18.2d, v18.2d +; NO_SVE-NEXT: fcvtn2 v17.4s, v4.2d +; NO_SVE-NEXT: fcvtn v19.2s, v19.2d +; NO_SVE-NEXT: ucvtf v21.2d, v21.2d +; NO_SVE-NEXT: ldp q23, q22, [x0, #160] +; NO_SVE-NEXT: ucvtf v20.2d, v20.2d +; NO_SVE-NEXT: fcvtn2 v19.4s, v18.2d +; NO_SVE-NEXT: fcvtn v21.2s, v21.2d +; NO_SVE-NEXT: stp q17, q3, [x1] +; NO_SVE-NEXT: stp q1, q5, [x1, #96] +; NO_SVE-NEXT: ucvtf v23.2d, v23.2d +; NO_SVE-NEXT: stp q19, q7, [x1, #32] +; NO_SVE-NEXT: ucvtf v22.2d, v22.2d +; NO_SVE-NEXT: fcvtn2 v21.4s, v20.2d +; NO_SVE-NEXT: fcvtn v23.2s, v23.2d +; NO_SVE-NEXT: fcvtn2 v23.4s, v22.2d +; NO_SVE-NEXT: stp q21, q23, [x1, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i64_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x11, #8 +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #20 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ucvtf z1.s, p1/m, z1.d +; VBITS_EQ_256-NEXT: ucvtf z2.s, p1/m, z2.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: ptrue p2.s, vl4 +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: splice z2.s, p2, z2.s, z1.s +; VBITS_EQ_256-NEXT: movprfx z1, z6 +; VBITS_EQ_256-NEXT: ucvtf z1.s, p1/m, z6.d +; VBITS_EQ_256-NEXT: ucvtf z5.s, p1/m, z5.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: ucvtf z3.s, p1/m, z3.d +; VBITS_EQ_256-NEXT: ucvtf z4.s, p1/m, z4.d +; VBITS_EQ_256-NEXT: splice z5.s, p2, z5.s, z1.s +; VBITS_EQ_256-NEXT: ucvtf z0.s, p1/m, z0.d +; VBITS_EQ_256-NEXT: movprfx z1, z7 +; VBITS_EQ_256-NEXT: ucvtf z1.s, p1/m, z7.d +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 
z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: splice z4.s, p2, z4.s, z3.s +; VBITS_EQ_256-NEXT: splice z1.s, p2, z1.s, z0.s +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -875,6 +2494,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v1i64_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: ucvtf d0, x8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v1i64_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -887,6 +2513,11 @@ ; Don't use SVE for 128-bit vectors. define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) #0 { +; NO_SVE-LABEL: ucvtf_v2i64_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v2i64_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ucvtf v0.2d, v0.2d @@ -896,6 +2527,14 @@ } define void @ucvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v4i64_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: ucvtf_v4i64_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -910,6 +2549,18 @@ } define void @ucvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v8i64_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: ucvtf_v8i64_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -936,6 +2587,46 @@ } define void @ucvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: ucvtf_v16i64_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: ldp q6, q7, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #96] +; NO_SVE-NEXT: ucvtf v0.2d, v5.2d +; NO_SVE-NEXT: ucvtf v1.2d, v6.2d +; NO_SVE-NEXT: ucvtf v2.2d, v7.2d +; NO_SVE-NEXT: stp q4, q0, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v16i64_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: ucvtf z2.d, 
p0/m, z2.d +; VBITS_EQ_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -950,7 +2641,79 @@ } define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 { -; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f64: +; NO_SVE-LABEL: ucvtf_v32i64_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: ucvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: ucvtf v1.2d, v1.2d +; NO_SVE-NEXT: ucvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: ucvtf v3.2d, v3.2d +; NO_SVE-NEXT: ucvtf v4.2d, v4.2d +; NO_SVE-NEXT: ldp q6, q7, [x0, #128] +; NO_SVE-NEXT: ldp q16, q17, [x0, #96] +; NO_SVE-NEXT: ldp q18, q19, [x0, #64] +; NO_SVE-NEXT: ldp q20, q21, [x0, #32] +; NO_SVE-NEXT: ldp q22, q23, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #192] +; NO_SVE-NEXT: ucvtf v2.2d, v7.2d +; NO_SVE-NEXT: stp q0, q1, [x1, #224] +; NO_SVE-NEXT: ucvtf v0.2d, v5.2d +; NO_SVE-NEXT: ucvtf v1.2d, v6.2d +; NO_SVE-NEXT: ucvtf v3.2d, v16.2d +; NO_SVE-NEXT: stp q4, q0, [x1, #160] +; NO_SVE-NEXT: ucvtf v4.2d, v17.2d +; NO_SVE-NEXT: ucvtf v0.2d, v18.2d +; NO_SVE-NEXT: stp q1, q2, [x1, #128] +; NO_SVE-NEXT: ucvtf v1.2d, v19.2d +; NO_SVE-NEXT: ucvtf v2.2d, v20.2d +; NO_SVE-NEXT: stp q3, q4, [x1, #96] +; NO_SVE-NEXT: ucvtf v3.2d, v21.2d +; NO_SVE-NEXT: ucvtf v4.2d, v22.2d +; NO_SVE-NEXT: stp q0, q1, [x1, #64] +; NO_SVE-NEXT: ucvtf v0.2d, v23.2d +; NO_SVE-NEXT: stp q2, q3, [x1, #32] +; NO_SVE-NEXT: stp q4, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: ucvtf_v32i64_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: mov x13, #28 +; VBITS_EQ_256-NEXT: mov x14, #16 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: ucvtf z5.d, p0/m, z5.d +; VBITS_EQ_256-NEXT: ucvtf z4.d, p0/m, z4.d +; VBITS_EQ_256-NEXT: ucvtf z6.d, p0/m, z6.d +; VBITS_EQ_256-NEXT: ucvtf z7.d, p0/m, z7.d +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f64: ; VBITS_GE_2048: 
// %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] @@ -969,6 +2732,11 @@ ; Don't use SVE for 64-bit vectors. define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 { +; NO_SVE-LABEL: scvtf_v4i16_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: scvtf v0.4h, v0.4h +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i16_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: scvtf v0.4h, v0.4h @@ -979,6 +2747,13 @@ ; Don't use SVE for 128-bit vectors. define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v8i16_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: scvtf v0.8h, v0.8h +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v8i16_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -992,6 +2767,14 @@ } define void @scvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i16_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: scvtf v0.8h, v0.8h +; NO_SVE-NEXT: scvtf v1.8h, v1.8h +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v16i16_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -1006,6 +2789,18 @@ } define void @scvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i16_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: scvtf v0.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: scvtf v1.8h, v1.8h +; NO_SVE-NEXT: scvtf v2.8h, v2.8h +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: scvtf v3.8h, v3.8h +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v32i16_v32f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -1032,6 +2827,46 @@ } define void @scvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v64i16_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: scvtf v0.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: scvtf v1.8h, v1.8h +; NO_SVE-NEXT: scvtf v2.8h, v2.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: scvtf v3.8h, v3.8h +; NO_SVE-NEXT: scvtf v4.8h, v4.8h +; NO_SVE-NEXT: ldp q6, q7, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #96] +; NO_SVE-NEXT: scvtf v0.8h, v5.8h +; NO_SVE-NEXT: scvtf v1.8h, v6.8h +; NO_SVE-NEXT: scvtf v2.8h, v7.8h +; NO_SVE-NEXT: stp q4, q0, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v64i16_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.h +; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.h +; VBITS_EQ_256-NEXT: scvtf z2.h, p0/m, z2.h +; VBITS_EQ_256-NEXT: scvtf z3.h, p0/m, z3.h +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v64i16_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -1046,6 +2881,78 @@ } define void @scvtf_v128i16_v128f16(<128 x 
i16>* %a, <128 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v128i16_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: scvtf v0.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: scvtf v1.8h, v1.8h +; NO_SVE-NEXT: scvtf v2.8h, v2.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: scvtf v3.8h, v3.8h +; NO_SVE-NEXT: scvtf v4.8h, v4.8h +; NO_SVE-NEXT: ldp q6, q7, [x0, #128] +; NO_SVE-NEXT: ldp q16, q17, [x0, #96] +; NO_SVE-NEXT: ldp q18, q19, [x0, #64] +; NO_SVE-NEXT: ldp q20, q21, [x0, #32] +; NO_SVE-NEXT: ldp q22, q23, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #192] +; NO_SVE-NEXT: scvtf v2.8h, v7.8h +; NO_SVE-NEXT: stp q0, q1, [x1, #224] +; NO_SVE-NEXT: scvtf v0.8h, v5.8h +; NO_SVE-NEXT: scvtf v1.8h, v6.8h +; NO_SVE-NEXT: scvtf v3.8h, v16.8h +; NO_SVE-NEXT: stp q4, q0, [x1, #160] +; NO_SVE-NEXT: scvtf v4.8h, v17.8h +; NO_SVE-NEXT: scvtf v0.8h, v18.8h +; NO_SVE-NEXT: stp q1, q2, [x1, #128] +; NO_SVE-NEXT: scvtf v1.8h, v19.8h +; NO_SVE-NEXT: scvtf v2.8h, v20.8h +; NO_SVE-NEXT: stp q3, q4, [x1, #96] +; NO_SVE-NEXT: scvtf v3.8h, v21.8h +; NO_SVE-NEXT: scvtf v4.8h, v22.8h +; NO_SVE-NEXT: stp q0, q1, [x1, #64] +; NO_SVE-NEXT: scvtf v0.8h, v23.8h +; NO_SVE-NEXT: stp q2, q3, [x1, #32] +; NO_SVE-NEXT: stp q4, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v128i16_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #96 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #80 +; VBITS_EQ_256-NEXT: mov x12, #32 +; VBITS_EQ_256-NEXT: mov x13, #112 +; VBITS_EQ_256-NEXT: mov x14, #64 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.h +; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.h +; VBITS_EQ_256-NEXT: scvtf z3.h, p0/m, z3.h +; VBITS_EQ_256-NEXT: scvtf z2.h, p0/m, z2.h +; VBITS_EQ_256-NEXT: scvtf z5.h, p0/m, z5.h +; VBITS_EQ_256-NEXT: scvtf z4.h, p0/m, z4.h +; VBITS_EQ_256-NEXT: scvtf z6.h, p0/m, z6.h +; VBITS_EQ_256-NEXT: scvtf z7.h, p0/m, z7.h +; VBITS_EQ_256-NEXT: st1h { z6.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v128i16_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -1065,6 +2972,13 @@ ; Don't use SVE for 64-bit vectors. define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i16_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v0.2s, v0.2s, #16 +; NO_SVE-NEXT: sshr v0.2s, v0.2s, #16 +; NO_SVE-NEXT: scvtf v0.2s, v0.2s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i16_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v0.2s, v0.2s, #16 @@ -1077,6 +2991,12 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) #0 { +; NO_SVE-LABEL: scvtf_v4i16_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i16_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 @@ -1087,6 +3007,16 @@ } define void @scvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v8i16_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v8i16_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -1102,6 +3032,21 @@ } define void @scvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i16_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: sshll v2.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v3.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: stp q3, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v16i16_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 @@ -1133,6 +3078,57 @@ } define void @scvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i16_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: sshll v6.4s, v3.4h, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v3.8h, #0 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: sshll v7.4s, v2.4h, #0 +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: sshll2 v2.4s, v2.8h, #0 +; NO_SVE-NEXT: scvtf v7.4s, v7.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: scvtf v6.4s, v6.4s +; NO_SVE-NEXT: sshll v4.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll v5.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: stp q6, q3, [x1, #64] +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: scvtf v2.4s, v5.4s +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: scvtf v3.4s, v4.4s +; NO_SVE-NEXT: stp q2, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i16_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z0.h +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z1.h +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v32i16_v32f32: ; VBITS_GE_1024: 
// %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 @@ -1149,6 +3145,99 @@ } define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v64i16_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: sshll v18.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v1.8h, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #64] +; NO_SVE-NEXT: scvtf v18.4s, v18.4s +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: sshll v19.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: scvtf v19.4s, v19.4s +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: sshll v17.4s, v4.4h, #0 +; NO_SVE-NEXT: sshll2 v4.4s, v4.8h, #0 +; NO_SVE-NEXT: scvtf v17.4s, v17.4s +; NO_SVE-NEXT: sshll v6.4s, v3.4h, #0 +; NO_SVE-NEXT: scvtf v4.4s, v4.4s +; NO_SVE-NEXT: sshll v16.4s, v5.4h, #0 +; NO_SVE-NEXT: ldp q21, q20, [x0] +; NO_SVE-NEXT: sshll v7.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q18, q1, [x1, #192] +; NO_SVE-NEXT: sshll2 v2.4s, v2.8h, #0 +; NO_SVE-NEXT: stp q17, q4, [x1, #160] +; NO_SVE-NEXT: sshll2 v1.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q19, q0, [x1, #224] +; NO_SVE-NEXT: scvtf v6.4s, v6.4s +; NO_SVE-NEXT: scvtf v7.4s, v7.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: scvtf v16.4s, v16.4s +; NO_SVE-NEXT: sshll2 v5.4s, v5.8h, #0 +; NO_SVE-NEXT: sshll v22.4s, v21.4h, #0 +; NO_SVE-NEXT: sshll v0.4s, v20.4h, #0 +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: sshll2 v3.4s, v20.8h, #0 +; NO_SVE-NEXT: stp q6, q1, [x1, #64] +; NO_SVE-NEXT: sshll2 v4.4s, v21.8h, #0 +; NO_SVE-NEXT: scvtf v5.4s, v5.4s +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: scvtf v2.4s, v4.4s +; NO_SVE-NEXT: scvtf v1.4s, v22.4s +; NO_SVE-NEXT: stp q16, q5, [x1, #128] +; NO_SVE-NEXT: stp q0, q3, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v64i16_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #32 +; VBITS_EQ_256-NEXT: mov x10, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z5.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z6.s, z2.h +; VBITS_EQ_256-NEXT: scvtf z4.s, p0/m, z4.s +; VBITS_EQ_256-NEXT: scvtf z5.s, p0/m, z5.s +; VBITS_EQ_256-NEXT: scvtf z6.s, p0/m, z6.s +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z7.s, z3.h +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: mov x10, #40 +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: scvtf z7.s, p0/m, z7.s +; VBITS_EQ_256-NEXT: scvtf z3.s, 
p0/m, z3.s +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v64i16_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64 @@ -1170,6 +3259,13 @@ ; v1i16 is perfered to be widened to v4i16, which pushes the output into SVE types, so use SVE define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) #0 { +; NO_SVE-LABEL: scvtf_v1i16_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: smov w8, v0.h[0] +; NO_SVE-NEXT: scvtf d0, w8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v1i16_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 @@ -1185,6 +3281,14 @@ ; Don't use SVE for 128-bit vectors. define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i16_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v0.2s, v0.2s, #16 +; NO_SVE-NEXT: sshr v0.2s, v0.2s, #16 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i16_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v0.2s, v0.2s, #16 @@ -1197,6 +3301,28 @@ } define void @scvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v4i16_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: umov w8, v0.h[2] +; NO_SVE-NEXT: umov w9, v0.h[0] +; NO_SVE-NEXT: umov w10, v0.h[3] +; NO_SVE-NEXT: fmov s1, w8 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: fmov s0, w9 +; NO_SVE-NEXT: mov v1.s[1], w10 +; NO_SVE-NEXT: mov v0.s[1], w8 +; NO_SVE-NEXT: shl v1.2s, v1.2s, #16 +; NO_SVE-NEXT: shl v0.2s, v0.2s, #16 +; NO_SVE-NEXT: sshr v1.2s, v1.2s, #16 +; NO_SVE-NEXT: sshr v0.2s, v0.2s, #16 +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i16_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -1213,6 +3339,46 @@ } define void @scvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v8i16_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: umov w8, v0.h[0] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w11, v0.h[1] +; NO_SVE-NEXT: umov w10, v1.h[0] +; NO_SVE-NEXT: umov w12, v1.h[2] +; NO_SVE-NEXT: fmov s2, w8 +; NO_SVE-NEXT: umov w8, v0.h[3] +; NO_SVE-NEXT: fmov s0, w9 +; NO_SVE-NEXT: umov w9, v1.h[1] +; NO_SVE-NEXT: fmov s3, w10 +; NO_SVE-NEXT: umov w10, v1.h[3] +; NO_SVE-NEXT: fmov s1, w12 +; NO_SVE-NEXT: mov v2.s[1], w11 +; NO_SVE-NEXT: mov v0.s[1], w8 +; NO_SVE-NEXT: mov v3.s[1], w9 +; NO_SVE-NEXT: mov v1.s[1], w10 +; NO_SVE-NEXT: shl v2.2s, v2.2s, #16 +; NO_SVE-NEXT: shl v0.2s, v0.2s, #16 +; NO_SVE-NEXT: shl v3.2s, v3.2s, #16 +; NO_SVE-NEXT: sshr v2.2s, v2.2s, #16 +; NO_SVE-NEXT: shl v1.2s, v1.2s, #16 +; NO_SVE-NEXT: sshr v0.2s, v0.2s, #16 +; NO_SVE-NEXT: sshr v3.2s, v3.2s, #16 +; NO_SVE-NEXT: sshr v1.2s, v1.2s, #16 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll v3.2d, v3.2s, #0 +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v3.2d, 
v3.2d +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: stp q3, q1, [x1, #32] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v8i16_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ldr q0, [x0] @@ -1245,6 +3411,111 @@ } define void @scvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i16_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: umov w8, v1.h[0] +; NO_SVE-NEXT: umov w9, v1.h[1] +; NO_SVE-NEXT: umov w10, v2.h[0] +; NO_SVE-NEXT: umov w11, v2.h[2] +; NO_SVE-NEXT: fmov s3, w8 +; NO_SVE-NEXT: umov w8, v1.h[2] +; NO_SVE-NEXT: fmov s5, w10 +; NO_SVE-NEXT: umov w10, v2.h[3] +; NO_SVE-NEXT: mov v3.s[1], w9 +; NO_SVE-NEXT: umov w9, v1.h[3] +; NO_SVE-NEXT: fmov s4, w8 +; NO_SVE-NEXT: umov w8, v2.h[1] +; NO_SVE-NEXT: fmov s2, w11 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: mov v4.s[1], w9 +; NO_SVE-NEXT: mov v2.s[1], w10 +; NO_SVE-NEXT: mov v5.s[1], w8 +; NO_SVE-NEXT: umov w8, v0.h[0] +; NO_SVE-NEXT: umov w9, v1.h[0] +; NO_SVE-NEXT: umov w11, v1.h[2] +; NO_SVE-NEXT: umov w10, v0.h[2] +; NO_SVE-NEXT: shl v2.2s, v2.2s, #16 +; NO_SVE-NEXT: fmov s6, w8 +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: fmov s7, w9 +; NO_SVE-NEXT: umov w9, v1.h[3] +; NO_SVE-NEXT: fmov s1, w11 +; NO_SVE-NEXT: umov w11, v0.h[1] +; NO_SVE-NEXT: fmov s16, w10 +; NO_SVE-NEXT: umov w10, v0.h[3] +; NO_SVE-NEXT: shl v0.2s, v3.2s, #16 +; NO_SVE-NEXT: shl v3.2s, v4.2s, #16 +; NO_SVE-NEXT: mov v7.s[1], w8 +; NO_SVE-NEXT: shl v4.2s, v5.2s, #16 +; NO_SVE-NEXT: sshr v0.2s, v0.2s, #16 +; NO_SVE-NEXT: sshr v3.2s, v3.2s, #16 +; NO_SVE-NEXT: sshr v2.2s, v2.2s, #16 +; NO_SVE-NEXT: mov v1.s[1], w9 +; NO_SVE-NEXT: mov v6.s[1], w11 +; NO_SVE-NEXT: mov v16.s[1], w10 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v3.2d, v3.2s, #0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: shl v5.2s, v7.2s, #16 +; NO_SVE-NEXT: shl v1.2s, v1.2s, #16 +; NO_SVE-NEXT: shl v6.2s, v6.2s, #16 +; NO_SVE-NEXT: shl v7.2s, v16.2s, #16 +; NO_SVE-NEXT: sshr v5.2s, v5.2s, #16 +; NO_SVE-NEXT: sshr v1.2s, v1.2s, #16 +; NO_SVE-NEXT: sshr v6.2s, v6.2s, #16 +; NO_SVE-NEXT: sshr v7.2s, v7.2s, #16 +; NO_SVE-NEXT: stp q0, q3, [x1, #64] +; NO_SVE-NEXT: sshr v0.2s, v4.2s, #16 +; NO_SVE-NEXT: sshll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: scvtf v3.2d, v5.2d +; NO_SVE-NEXT: scvtf v4.2d, v7.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v5.2d, v6.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: stp q3, q1, [x1, #96] +; NO_SVE-NEXT: stp q5, q4, [x1] +; NO_SVE-NEXT: stp q0, q2, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v16i16_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov z1.d, z0.d +; VBITS_EQ_256-NEXT: sunpklo z2.s, z0.h +; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; 
VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z1.s, z3.h +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z0.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i16_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -1262,6 +3533,207 @@ } define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i16_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; NO_SVE-NEXT: ext v6.16b, v2.16b, v2.16b, #8 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: umov w8, v5.h[2] +; NO_SVE-NEXT: umov w9, v5.h[3] +; NO_SVE-NEXT: umov w11, v5.h[1] +; NO_SVE-NEXT: umov w10, v6.h[2] +; NO_SVE-NEXT: umov w12, v6.h[3] +; NO_SVE-NEXT: ext v17.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: fmov s4, w8 +; NO_SVE-NEXT: umov w8, v5.h[0] +; NO_SVE-NEXT: ext v19.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: mov v4.s[1], w9 +; NO_SVE-NEXT: umov w9, v6.h[0] +; NO_SVE-NEXT: fmov s5, w8 +; NO_SVE-NEXT: umov w8, v6.h[1] +; NO_SVE-NEXT: fmov s6, w10 +; NO_SVE-NEXT: umov w10, v17.h[2] +; NO_SVE-NEXT: shl v4.2s, v4.2s, #16 +; NO_SVE-NEXT: fmov s7, w9 +; NO_SVE-NEXT: umov w9, v17.h[0] +; NO_SVE-NEXT: mov v6.s[1], w12 +; NO_SVE-NEXT: umov w12, v19.h[0] +; NO_SVE-NEXT: mov v5.s[1], w11 +; NO_SVE-NEXT: mov v7.s[1], w8 +; NO_SVE-NEXT: umov w8, v19.h[2] +; NO_SVE-NEXT: umov w11, v17.h[3] +; NO_SVE-NEXT: fmov s16, w10 +; NO_SVE-NEXT: umov w10, v17.h[1] +; NO_SVE-NEXT: fmov s17, w9 +; NO_SVE-NEXT: umov w9, v19.h[3] +; NO_SVE-NEXT: shl v6.2s, v6.2s, #16 +; NO_SVE-NEXT: fmov s18, w8 +; NO_SVE-NEXT: umov w8, v19.h[1] +; NO_SVE-NEXT: fmov s19, w12 +; NO_SVE-NEXT: umov w12, v2.h[0] +; NO_SVE-NEXT: mov v17.s[1], w10 +; NO_SVE-NEXT: umov w10, v2.h[2] +; NO_SVE-NEXT: mov v18.s[1], w9 +; NO_SVE-NEXT: mov v19.s[1], w8 +; NO_SVE-NEXT: umov w8, v3.h[2] +; NO_SVE-NEXT: umov w9, v3.h[0] +; NO_SVE-NEXT: fmov s21, w10 +; NO_SVE-NEXT: mov v16.s[1], w11 +; NO_SVE-NEXT: umov w11, v3.h[3] +; NO_SVE-NEXT: umov w10, v2.h[1] +; NO_SVE-NEXT: fmov s20, w8 +; NO_SVE-NEXT: umov w8, v3.h[1] +; NO_SVE-NEXT: fmov s3, w9 +; NO_SVE-NEXT: umov w9, v2.h[3] +; NO_SVE-NEXT: fmov s2, w12 +; NO_SVE-NEXT: shl v18.2s, v18.2s, #16 +; NO_SVE-NEXT: mov v20.s[1], w11 +; NO_SVE-NEXT: umov w11, v0.h[0] +; NO_SVE-NEXT: mov v3.s[1], w8 +; NO_SVE-NEXT: mov v21.s[1], w9 +; NO_SVE-NEXT: umov w8, v0.h[2] +; NO_SVE-NEXT: umov w9, v1.h[2] +; NO_SVE-NEXT: fmov s24, w11 +; NO_SVE-NEXT: mov v2.s[1], w10 +; NO_SVE-NEXT: umov w10, v1.h[0] +; NO_SVE-NEXT: umov w11, v0.h[1] +; NO_SVE-NEXT: fmov s22, w8 +; NO_SVE-NEXT: umov w8, v1.h[3] +; NO_SVE-NEXT: fmov s23, w9 +; NO_SVE-NEXT: umov w9, v1.h[1] +; NO_SVE-NEXT: fmov s1, w10 +; NO_SVE-NEXT: umov w10, v0.h[3] +; NO_SVE-NEXT: mov v24.s[1], w11 +; NO_SVE-NEXT: shl v21.2s, v21.2s, #16 +; NO_SVE-NEXT: mov v23.s[1], w8 +; NO_SVE-NEXT: mov v1.s[1], w9 +; NO_SVE-NEXT: mov v22.s[1], w10 +; NO_SVE-NEXT: shl v2.2s, v2.2s, #16 +; NO_SVE-NEXT: shl v0.2s, v23.2s, #16 +; NO_SVE-NEXT: shl v23.2s, v24.2s, #16 +; NO_SVE-NEXT: shl v1.2s, v1.2s, #16 +; NO_SVE-NEXT: sshr v21.2s, v21.2s, #16 +; NO_SVE-NEXT: sshr 
v0.2s, v0.2s, #16 +; NO_SVE-NEXT: shl v22.2s, v22.2s, #16 +; NO_SVE-NEXT: sshr v1.2s, v1.2s, #16 +; NO_SVE-NEXT: sshr v23.2s, v23.2s, #16 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshr v22.2s, v22.2s, #16 +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: sshr v2.2s, v2.2s, #16 +; NO_SVE-NEXT: sshll v23.2d, v23.2s, #0 +; NO_SVE-NEXT: shl v20.2s, v20.2s, #16 +; NO_SVE-NEXT: sshll v22.2d, v22.2s, #0 +; NO_SVE-NEXT: shl v3.2s, v3.2s, #16 +; NO_SVE-NEXT: scvtf v23.2d, v23.2d +; NO_SVE-NEXT: stp q1, q0, [x1, #192] +; NO_SVE-NEXT: scvtf v1.2d, v22.2d +; NO_SVE-NEXT: sshr v20.2s, v20.2s, #16 +; NO_SVE-NEXT: sshll v0.2d, v2.2s, #0 +; NO_SVE-NEXT: sshr v3.2s, v3.2s, #16 +; NO_SVE-NEXT: sshll v2.2d, v21.2s, #0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: shl v19.2s, v19.2s, #16 +; NO_SVE-NEXT: stp q23, q1, [x1, #128] +; NO_SVE-NEXT: sshr v18.2s, v18.2s, #16 +; NO_SVE-NEXT: sshll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: sshr v19.2s, v19.2s, #16 +; NO_SVE-NEXT: sshll v3.2d, v20.2s, #0 +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: stp q0, q2, [x1, #64] +; NO_SVE-NEXT: scvtf v0.2d, v3.2d +; NO_SVE-NEXT: shl v5.2s, v5.2s, #16 +; NO_SVE-NEXT: sshll v2.2d, v18.2s, #0 +; NO_SVE-NEXT: shl v7.2s, v7.2s, #16 +; NO_SVE-NEXT: sshll v3.2d, v19.2s, #0 +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: scvtf v1.2d, v3.2d +; NO_SVE-NEXT: shl v16.2s, v16.2s, #16 +; NO_SVE-NEXT: shl v17.2s, v17.2s, #16 +; NO_SVE-NEXT: sshr v6.2s, v6.2s, #16 +; NO_SVE-NEXT: sshr v7.2s, v7.2s, #16 +; NO_SVE-NEXT: sshr v16.2s, v16.2s, #16 +; NO_SVE-NEXT: sshr v17.2s, v17.2s, #16 +; NO_SVE-NEXT: stp q1, q2, [x1, #224] +; NO_SVE-NEXT: sshr v1.2s, v4.2s, #16 +; NO_SVE-NEXT: sshr v2.2s, v5.2s, #16 +; NO_SVE-NEXT: sshll v0.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll v3.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll v6.2d, v16.2s, #0 +; NO_SVE-NEXT: sshll v7.2d, v17.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: sshll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v4.2d, v7.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: scvtf v5.2d, v6.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: stp q3, q0, [x1, #96] +; NO_SVE-NEXT: stp q4, q5, [x1, #160] +; NO_SVE-NEXT: stp q2, q1, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i16_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z3.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: sunpklo z4.s, z1.h +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_EQ_256-NEXT: scvtf z0.d, 
p0/m, z0.d +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: sunpklo z6.s, z6.h +; VBITS_EQ_256-NEXT: movprfx z0, z5 +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z5.d +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #28 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z6.s +; VBITS_EQ_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: movprfx z0, z4 +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z4.d +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i16_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -1284,6 +3756,13 @@ ; Don't use SVE for 64-bit vectors. define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i32_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i32_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -1296,6 +3775,12 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) #0 { +; NO_SVE-LABEL: scvtf_v4i32_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i32_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: scvtf v0.4s, v0.4s @@ -1306,6 +3791,16 @@ } define <8 x half> @scvtf_v8i32_v8f16(<8 x i32>* %a) #0 { +; NO_SVE-LABEL: scvtf_v8i32_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v8i32_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -1321,6 +3816,23 @@ } define void @scvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i32_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v16i32_v16f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -1355,6 +3867,63 @@ } define void @scvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i32_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #96] +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: scvtf v4.4s, v4.4s +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: scvtf v5.4s, v5.4s +; NO_SVE-NEXT: fcvtn v4.4h, v4.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: scvtf v6.4s, v6.4s +; NO_SVE-NEXT: fcvtn v5.4h, v5.4s +; NO_SVE-NEXT: scvtf v7.4s, v7.4s +; NO_SVE-NEXT: stp q0, q2, [x1, #32] +; NO_SVE-NEXT: fcvtn v6.4h, v6.4s +; NO_SVE-NEXT: mov v4.d[1], v5.d[0] +; NO_SVE-NEXT: fcvtn v7.4h, v7.4s +; NO_SVE-NEXT: mov v6.d[1], v7.d[0] +; NO_SVE-NEXT: stp q4, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i32_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ptrue p1.s +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 +; VBITS_EQ_256-NEXT: scvtf z0.h, p1/m, z0.s +; VBITS_EQ_256-NEXT: scvtf z2.h, p1/m, z2.s +; VBITS_EQ_256-NEXT: scvtf z1.h, p1/m, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: scvtf z3.h, p1/m, z3.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: splice z0.h, p0, z0.h, 
z1.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -1372,6 +3941,111 @@ } define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v64i32_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #192] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: scvtf v4.4s, v4.4s +; NO_SVE-NEXT: fcvtn v3.4h, v3.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: scvtf v5.4s, v5.4s +; NO_SVE-NEXT: fcvtn v4.4h, v4.4s +; NO_SVE-NEXT: mov v2.d[1], v3.d[0] +; NO_SVE-NEXT: scvtf v6.4s, v6.4s +; NO_SVE-NEXT: fcvtn v5.4h, v5.4s +; NO_SVE-NEXT: ldp q16, q17, [x0, #64] +; NO_SVE-NEXT: scvtf v7.4s, v7.4s +; NO_SVE-NEXT: fcvtn v6.4h, v6.4s +; NO_SVE-NEXT: mov v4.d[1], v5.d[0] +; NO_SVE-NEXT: scvtf v16.4s, v16.4s +; NO_SVE-NEXT: fcvtn v7.4h, v7.4s +; NO_SVE-NEXT: ldp q18, q19, [x0, #224] +; NO_SVE-NEXT: scvtf v17.4s, v17.4s +; NO_SVE-NEXT: fcvtn v16.4h, v16.4s +; NO_SVE-NEXT: mov v6.d[1], v7.d[0] +; NO_SVE-NEXT: scvtf v18.4s, v18.4s +; NO_SVE-NEXT: fcvtn v17.4h, v17.4s +; NO_SVE-NEXT: ldp q20, q21, [x0, #128] +; NO_SVE-NEXT: scvtf v19.4s, v19.4s +; NO_SVE-NEXT: fcvtn v18.4h, v18.4s +; NO_SVE-NEXT: mov v16.d[1], v17.d[0] +; NO_SVE-NEXT: scvtf v20.4s, v20.4s +; NO_SVE-NEXT: fcvtn v19.4h, v19.4s +; NO_SVE-NEXT: ldp q22, q23, [x0, #160] +; NO_SVE-NEXT: scvtf v21.4s, v21.4s +; NO_SVE-NEXT: stp q4, q2, [x1] +; NO_SVE-NEXT: fcvtn v20.4h, v20.4s +; NO_SVE-NEXT: stp q16, q6, [x1, #32] +; NO_SVE-NEXT: mov v18.d[1], v19.d[0] +; NO_SVE-NEXT: scvtf v22.4s, v22.4s +; NO_SVE-NEXT: fcvtn v21.4h, v21.4s +; NO_SVE-NEXT: scvtf v23.4s, v23.4s +; NO_SVE-NEXT: stp q0, q18, [x1, #96] +; NO_SVE-NEXT: fcvtn v22.4h, v22.4s +; NO_SVE-NEXT: mov v20.d[1], v21.d[0] +; NO_SVE-NEXT: fcvtn v23.4h, v23.4s +; NO_SVE-NEXT: mov v22.d[1], v23.d[0] +; NO_SVE-NEXT: stp q20, q22, [x1, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v64i32_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x12, #48 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #40 +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: ptrue p1.s +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: scvtf z1.h, p1/m, z1.s +; VBITS_EQ_256-NEXT: scvtf z2.h, p1/m, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: ptrue p2.h, vl8 +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: splice z2.h, p2, z2.h, z1.h +; VBITS_EQ_256-NEXT: movprfx z1, z6 +; 
VBITS_EQ_256-NEXT: scvtf z1.h, p1/m, z6.s +; VBITS_EQ_256-NEXT: scvtf z5.h, p1/m, z5.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_EQ_256-NEXT: scvtf z3.h, p1/m, z3.s +; VBITS_EQ_256-NEXT: scvtf z4.h, p1/m, z4.s +; VBITS_EQ_256-NEXT: splice z5.h, p2, z5.h, z1.h +; VBITS_EQ_256-NEXT: scvtf z0.h, p1/m, z0.s +; VBITS_EQ_256-NEXT: movprfx z1, z7 +; VBITS_EQ_256-NEXT: scvtf z1.h, p1/m, z7.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: splice z4.h, p2, z4.h, z3.h +; VBITS_EQ_256-NEXT: splice z1.h, p2, z1.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -1394,6 +4068,11 @@ ; Don't use SVE for 64-bit vectors. define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i32_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: scvtf v0.2s, v0.2s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i32_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: scvtf v0.2s, v0.2s @@ -1404,6 +4083,11 @@ ; Don't use SVE for 128-bit vectors. define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) #0 { +; NO_SVE-LABEL: scvtf_v4i32_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i32_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: scvtf v0.4s, v0.4s @@ -1413,6 +4097,14 @@ } define void @scvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v8i32_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v8i32_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -1427,6 +4119,18 @@ } define void @scvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i32_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v16i32_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -1453,6 +4157,46 @@ } define void @scvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i32_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: scvtf v4.4s, v4.4s +; NO_SVE-NEXT: ldp q6, q7, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #96] +; NO_SVE-NEXT: scvtf v0.4s, v5.4s +; NO_SVE-NEXT: scvtf v1.4s, v6.4s +; NO_SVE-NEXT: scvtf v2.4s, v7.4s +; NO_SVE-NEXT: stp q4, q0, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i32_v32f32: +; VBITS_EQ_256: // %bb.0: +; 
VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: scvtf z3.s, p0/m, z3.s +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -1467,6 +4211,78 @@ } define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v64i32_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: scvtf v0.4s, v0.4s +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: scvtf v1.4s, v1.4s +; NO_SVE-NEXT: scvtf v2.4s, v2.4s +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: scvtf v3.4s, v3.4s +; NO_SVE-NEXT: scvtf v4.4s, v4.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #128] +; NO_SVE-NEXT: ldp q16, q17, [x0, #96] +; NO_SVE-NEXT: ldp q18, q19, [x0, #64] +; NO_SVE-NEXT: ldp q20, q21, [x0, #32] +; NO_SVE-NEXT: ldp q22, q23, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #192] +; NO_SVE-NEXT: scvtf v2.4s, v7.4s +; NO_SVE-NEXT: stp q0, q1, [x1, #224] +; NO_SVE-NEXT: scvtf v0.4s, v5.4s +; NO_SVE-NEXT: scvtf v1.4s, v6.4s +; NO_SVE-NEXT: scvtf v3.4s, v16.4s +; NO_SVE-NEXT: stp q4, q0, [x1, #160] +; NO_SVE-NEXT: scvtf v4.4s, v17.4s +; NO_SVE-NEXT: scvtf v0.4s, v18.4s +; NO_SVE-NEXT: stp q1, q2, [x1, #128] +; NO_SVE-NEXT: scvtf v1.4s, v19.4s +; NO_SVE-NEXT: scvtf v2.4s, v20.4s +; NO_SVE-NEXT: stp q3, q4, [x1, #96] +; NO_SVE-NEXT: scvtf v3.4s, v21.4s +; NO_SVE-NEXT: scvtf v4.4s, v22.4s +; NO_SVE-NEXT: stp q0, q1, [x1, #64] +; NO_SVE-NEXT: scvtf v0.4s, v23.4s +; NO_SVE-NEXT: stp q2, q3, [x1, #32] +; NO_SVE-NEXT: stp q4, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v64i32_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, #56 +; VBITS_EQ_256-NEXT: mov x14, #32 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_EQ_256-NEXT: scvtf z3.s, p0/m, z3.s +; VBITS_EQ_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_EQ_256-NEXT: scvtf z5.s, p0/m, z5.s +; VBITS_EQ_256-NEXT: scvtf z4.s, p0/m, z4.s +; VBITS_EQ_256-NEXT: scvtf z6.s, p0/m, z6.s +; VBITS_EQ_256-NEXT: scvtf z7.s, p0/m, z7.s +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x1, 
x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x1, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -1486,6 +4302,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) #0 { +; NO_SVE-LABEL: scvtf_v1i32_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v1i32_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 @@ -1498,6 +4321,12 @@ ; Don't use SVE for 128-bit vectors. define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i32_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i32_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 @@ -1508,6 +4337,16 @@ } define void @scvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v4i32_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i32_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -1523,6 +4362,21 @@ } define void @scvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v8i32_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: sshll v2.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v3.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v1.4s, #0 +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: stp q3, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v8i32_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 @@ -1554,6 +4408,57 @@ } define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i32_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: sshll v6.2d, v3.2s, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #0 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: sshll v7.2d, v2.2s, #0 +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: sshll2 v2.2d, v2.4s, #0 +; NO_SVE-NEXT: scvtf v7.2d, v7.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: scvtf v6.2d, v6.2d +; NO_SVE-NEXT: sshll v4.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll v5.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v1.4s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v0.4s, #0 +; NO_SVE-NEXT: stp q6, q3, [x1, #64] +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: scvtf v2.2d, v5.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v3.2d, v4.2d +; NO_SVE-NEXT: stp q2, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v16i32_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 
+; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z0.s +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z1.s +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i32_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -1570,6 +4475,99 @@ } define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i32_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: sshll v18.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v1.4s, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0, #64] +; NO_SVE-NEXT: scvtf v18.2d, v18.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: sshll v19.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v0.4s, #0 +; NO_SVE-NEXT: scvtf v19.2d, v19.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: sshll v17.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll2 v4.2d, v4.4s, #0 +; NO_SVE-NEXT: scvtf v17.2d, v17.2d +; NO_SVE-NEXT: sshll v6.2d, v3.2s, #0 +; NO_SVE-NEXT: scvtf v4.2d, v4.2d +; NO_SVE-NEXT: sshll v16.2d, v5.2s, #0 +; NO_SVE-NEXT: ldp q21, q20, [x0] +; NO_SVE-NEXT: sshll v7.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q18, q1, [x1, #192] +; NO_SVE-NEXT: sshll2 v2.2d, v2.4s, #0 +; NO_SVE-NEXT: stp q17, q4, [x1, #160] +; NO_SVE-NEXT: sshll2 v1.2d, v3.4s, #0 +; NO_SVE-NEXT: stp q19, q0, [x1, #224] +; NO_SVE-NEXT: scvtf v6.2d, v6.2d +; NO_SVE-NEXT: scvtf v7.2d, v7.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v16.2d, v16.2d +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v22.2d, v21.2s, #0 +; NO_SVE-NEXT: sshll v0.2d, v20.2s, #0 +; NO_SVE-NEXT: stp q7, q2, [x1, #96] +; NO_SVE-NEXT: sshll2 v3.2d, v20.4s, #0 +; NO_SVE-NEXT: stp q6, q1, [x1, #64] +; NO_SVE-NEXT: sshll2 v4.2d, v21.4s, #0 +; NO_SVE-NEXT: scvtf v5.2d, v5.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v2.2d, v4.2d +; NO_SVE-NEXT: scvtf v1.2d, v22.2d +; NO_SVE-NEXT: stp q16, q5, [x1, #128] +; NO_SVE-NEXT: stp q0, q3, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i32_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x11, #12 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: sunpklo z4.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z5.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z6.d, z2.s +; VBITS_EQ_256-NEXT: scvtf z4.d, p0/m, z4.d +; 
VBITS_EQ_256-NEXT: scvtf z5.d, p0/m, z5.d +; VBITS_EQ_256-NEXT: scvtf z6.d, p0/m, z6.d +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z7.d, z3.s +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #20 +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: scvtf z7.d, p0/m, z7.d +; VBITS_EQ_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i32_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1592,6 +4590,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) #0 { +; NO_SVE-LABEL: scvtf_v1i64_v1f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: scvtf h0, x8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v1i64_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -1604,6 +4609,16 @@ ; v2f16 is not legal for NEON, so use SVE define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i64_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: mov x8, v0.d[1] +; NO_SVE-NEXT: fmov x9, d0 +; NO_SVE-NEXT: scvtf h0, x9 +; NO_SVE-NEXT: scvtf h1, x8 +; NO_SVE-NEXT: mov v0.h[1], v1.h[0] +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i64_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 @@ -1618,6 +4633,16 @@ } define <4 x half> @scvtf_v4i64_v4f16(<4 x i64>* %a) #0 { +; NO_SVE-LABEL: scvtf_v4i64_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v1.2d +; NO_SVE-NEXT: fcvtn v0.4h, v0.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i64_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -1634,6 +4659,23 @@ } define <8 x half> @scvtf_v8i64_v8f16(<8 x i64>* %a) #0 { +; NO_SVE-LABEL: scvtf_v8i64_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q1, q3, [x0] +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v2.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: fcvtn v2.4h, v0.4s +; NO_SVE-NEXT: fcvtn2 v1.4s, v3.2d +; NO_SVE-NEXT: fcvtn v0.4h, v1.4s +; NO_SVE-NEXT: mov v0.d[1], v2.d[0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v8i64_v8f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -1667,6 +4709,68 @@ } define void 
@scvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i64_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #64] +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q3, q2, [x0, #96] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: ldp q5, q4, [x0] +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: scvtf v5.2d, v5.2d +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: scvtf v4.2d, v4.2d +; NO_SVE-NEXT: fcvtn2 v3.4s, v2.2d +; NO_SVE-NEXT: fcvtn v5.2s, v5.2d +; NO_SVE-NEXT: scvtf v6.2d, v6.2d +; NO_SVE-NEXT: fcvtn v0.4h, v3.4s +; NO_SVE-NEXT: scvtf v7.2d, v7.2d +; NO_SVE-NEXT: fcvtn2 v5.4s, v4.2d +; NO_SVE-NEXT: fcvtn v6.2s, v6.2d +; NO_SVE-NEXT: mov v1.d[1], v0.d[0] +; NO_SVE-NEXT: fcvtn v3.4h, v5.4s +; NO_SVE-NEXT: fcvtn2 v6.4s, v7.2d +; NO_SVE-NEXT: fcvtn v2.4h, v6.4s +; NO_SVE-NEXT: mov v3.d[1], v2.d[0] +; NO_SVE-NEXT: stp q3, q1, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v16i64_v16f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d +; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.d +; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.d +; VBITS_EQ_256-NEXT: scvtf z2.h, p0/m, z2.d +; VBITS_EQ_256-NEXT: scvtf z3.h, p0/m, z3.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: uzp1 z1.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 +; VBITS_EQ_256-NEXT: mov v2.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1685,6 +4789,122 @@ } define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i64_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q2, q5, [x0, #96] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q6, q4, [x0, #160] +; NO_SVE-NEXT: scvtf v5.2d, v5.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v2.2s, v2.2d +; NO_SVE-NEXT: scvtf v6.2d, v6.2d +; NO_SVE-NEXT: fcvtn v1.4h, v1.4s +; NO_SVE-NEXT: ldp q16, q7, [x0, #224] +; NO_SVE-NEXT: scvtf v4.2d, v4.2d +; NO_SVE-NEXT: fcvtn2 v2.4s, v5.2d +; NO_SVE-NEXT: fcvtn v6.2s, v6.2d +; NO_SVE-NEXT: scvtf v16.2d, v16.2d +; NO_SVE-NEXT: fcvtn v2.4h, v2.4s +; NO_SVE-NEXT: ldp q18, q17, [x0, #32] +; NO_SVE-NEXT: scvtf v7.2d, v7.2d +; NO_SVE-NEXT: fcvtn2 v6.4s, v4.2d +; NO_SVE-NEXT: fcvtn v16.2s, v16.2d +; NO_SVE-NEXT: scvtf v18.2d, v18.2d +; NO_SVE-NEXT: fcvtn v0.4h, v6.4s +; NO_SVE-NEXT: ldp q19, 
q3, [x0, #64] +; NO_SVE-NEXT: scvtf v17.2d, v17.2d +; NO_SVE-NEXT: fcvtn2 v16.4s, v7.2d +; NO_SVE-NEXT: fcvtn v18.2s, v18.2d +; NO_SVE-NEXT: mov v1.d[1], v0.d[0] +; NO_SVE-NEXT: scvtf v19.2d, v19.2d +; NO_SVE-NEXT: ldp q21, q20, [x0, #192] +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: fcvtn2 v18.4s, v17.2d +; NO_SVE-NEXT: fcvtn v19.2s, v19.2d +; NO_SVE-NEXT: scvtf v21.2d, v21.2d +; NO_SVE-NEXT: fcvtn v5.4h, v18.4s +; NO_SVE-NEXT: ldp q23, q22, [x0] +; NO_SVE-NEXT: scvtf v20.2d, v20.2d +; NO_SVE-NEXT: fcvtn2 v19.4s, v3.2d +; NO_SVE-NEXT: fcvtn v21.2s, v21.2d +; NO_SVE-NEXT: fcvtn v3.4h, v16.4s +; NO_SVE-NEXT: scvtf v23.2d, v23.2d +; NO_SVE-NEXT: fcvtn v7.4h, v19.4s +; NO_SVE-NEXT: scvtf v22.2d, v22.2d +; NO_SVE-NEXT: fcvtn2 v21.4s, v20.2d +; NO_SVE-NEXT: fcvtn v23.2s, v23.2d +; NO_SVE-NEXT: mov v7.d[1], v2.d[0] +; NO_SVE-NEXT: fcvtn v4.4h, v21.4s +; NO_SVE-NEXT: fcvtn2 v23.4s, v22.2d +; NO_SVE-NEXT: mov v4.d[1], v3.d[0] +; NO_SVE-NEXT: fcvtn v6.4h, v23.4s +; NO_SVE-NEXT: stp q1, q4, [x1, #32] +; NO_SVE-NEXT: mov v6.d[1], v5.d[0] +; NO_SVE-NEXT: stp q6, q7, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i64_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: mov x11, #28 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x13, #20 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.d +; VBITS_EQ_256-NEXT: scvtf z2.h, p0/m, z2.d +; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.d +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v1.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: movprfx z2, z6 +; VBITS_EQ_256-NEXT: scvtf z2.h, p0/m, z6.d +; VBITS_EQ_256-NEXT: scvtf z5.h, p0/m, z5.d +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_EQ_256-NEXT: scvtf z3.h, p0/m, z3.d +; VBITS_EQ_256-NEXT: mov v5.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: movprfx z2, z4 +; VBITS_EQ_256-NEXT: scvtf z2.h, p0/m, z4.d +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.d +; VBITS_EQ_256-NEXT: movprfx z2, z7 +; VBITS_EQ_256-NEXT: scvtf z2.h, p0/m, z7.d +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 +; VBITS_EQ_256-NEXT: mov v2.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: splice z5.h, p0, z5.h, z1.h +; VBITS_EQ_256-NEXT: splice z2.h, p0, z2.h, z3.h +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; 
VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1708,6 +4928,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) #0 { +; NO_SVE-LABEL: scvtf_v1i64_v1f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v1i64_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -1720,6 +4947,12 @@ ; Don't use SVE for 128-bit vectors. define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i64_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i64_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: scvtf v0.2d, v0.2d @@ -1730,6 +4963,15 @@ } define <4 x float> @scvtf_v4i64_v4f32(<4 x i64>* %a) #0 { +; NO_SVE-LABEL: scvtf_v4i64_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v1.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i64_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -1745,6 +4987,21 @@ } define void @scvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v8i64_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q1, q3, [x0] +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn v0.2s, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: fcvtn2 v0.4s, v2.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v3.2d +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v8i64_v8f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -1779,6 +5036,59 @@ } define void @scvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i64_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #64] +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q2, q5, [x0, #96] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q3, q6, [x0] +; NO_SVE-NEXT: scvtf v5.2d, v5.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v2.2s, v2.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: ldp q4, q7, [x0, #32] +; NO_SVE-NEXT: scvtf v6.2d, v6.2d +; NO_SVE-NEXT: fcvtn2 v2.4s, v5.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: scvtf v4.2d, v4.2d +; NO_SVE-NEXT: stp q1, q2, [x1, #32] +; NO_SVE-NEXT: scvtf v7.2d, v7.2d +; NO_SVE-NEXT: fcvtn2 v3.4s, v6.2d +; NO_SVE-NEXT: fcvtn v4.2s, v4.2d +; NO_SVE-NEXT: fcvtn2 v4.4s, v7.2d +; NO_SVE-NEXT: stp q3, q4, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v16i64_v16f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ptrue p1.d +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_256-NEXT: scvtf z0.s, p1/m, z0.d 
+; VBITS_EQ_256-NEXT: scvtf z2.s, p1/m, z2.d +; VBITS_EQ_256-NEXT: scvtf z1.s, p1/m, z1.d +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: scvtf z3.s, p1/m, z3.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: splice z2.s, p0, z2.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z0.s, p0, z0.s, z1.s +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1796,6 +5106,103 @@ } define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i64_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #192] +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: ldp q5, q4, [x0, #224] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: fcvtn v1.2s, v1.2d +; NO_SVE-NEXT: scvtf v5.2d, v5.2d +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: scvtf v4.2d, v4.2d +; NO_SVE-NEXT: fcvtn2 v1.4s, v0.2d +; NO_SVE-NEXT: fcvtn v5.2s, v5.2d +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: ldp q7, q6, [x0, #96] +; NO_SVE-NEXT: fcvtn2 v5.4s, v4.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: fcvtn v3.2s, v3.2d +; NO_SVE-NEXT: scvtf v7.2d, v7.2d +; NO_SVE-NEXT: ldp q17, q16, [x0] +; NO_SVE-NEXT: scvtf v0.2d, v6.2d +; NO_SVE-NEXT: fcvtn2 v3.4s, v2.2d +; NO_SVE-NEXT: fcvtn v7.2s, v7.2d +; NO_SVE-NEXT: scvtf v17.2d, v17.2d +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: scvtf v4.2d, v16.2d +; NO_SVE-NEXT: fcvtn2 v7.4s, v0.2d +; NO_SVE-NEXT: fcvtn v17.2s, v17.2d +; NO_SVE-NEXT: scvtf v19.2d, v19.2d +; NO_SVE-NEXT: ldp q21, q20, [x0, #128] +; NO_SVE-NEXT: scvtf v18.2d, v18.2d +; NO_SVE-NEXT: fcvtn2 v17.4s, v4.2d +; NO_SVE-NEXT: fcvtn v19.2s, v19.2d +; NO_SVE-NEXT: scvtf v21.2d, v21.2d +; NO_SVE-NEXT: ldp q23, q22, [x0, #160] +; NO_SVE-NEXT: scvtf v20.2d, v20.2d +; NO_SVE-NEXT: fcvtn2 v19.4s, v18.2d +; NO_SVE-NEXT: fcvtn v21.2s, v21.2d +; NO_SVE-NEXT: stp q17, q3, [x1] +; NO_SVE-NEXT: stp q1, q5, [x1, #96] +; NO_SVE-NEXT: scvtf v23.2d, v23.2d +; NO_SVE-NEXT: stp q19, q7, [x1, #32] +; NO_SVE-NEXT: scvtf v22.2d, v22.2d +; NO_SVE-NEXT: fcvtn2 v21.4s, v20.2d +; NO_SVE-NEXT: fcvtn v23.2s, v23.2d +; NO_SVE-NEXT: fcvtn2 v23.4s, v22.2d +; NO_SVE-NEXT: stp q21, q23, [x1, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i64_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x11, #8 +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #20 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: scvtf z1.s, p1/m, z1.d +; VBITS_EQ_256-NEXT: scvtf z2.s, p1/m, z2.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: ptrue p2.s, vl4 +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, 
p0/z, [x0] +; VBITS_EQ_256-NEXT: splice z2.s, p2, z2.s, z1.s +; VBITS_EQ_256-NEXT: movprfx z1, z6 +; VBITS_EQ_256-NEXT: scvtf z1.s, p1/m, z6.d +; VBITS_EQ_256-NEXT: scvtf z5.s, p1/m, z5.d +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: scvtf z3.s, p1/m, z3.d +; VBITS_EQ_256-NEXT: scvtf z4.s, p1/m, z4.d +; VBITS_EQ_256-NEXT: splice z5.s, p2, z5.s, z1.s +; VBITS_EQ_256-NEXT: scvtf z0.s, p1/m, z0.d +; VBITS_EQ_256-NEXT: movprfx z1, z7 +; VBITS_EQ_256-NEXT: scvtf z1.s, p1/m, z7.d +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: splice z4.s, p2, z4.s, z3.s +; VBITS_EQ_256-NEXT: splice z1.s, p2, z1.s, z0.s +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1818,6 +5225,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) #0 { +; NO_SVE-LABEL: scvtf_v1i64_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: scvtf d0, x8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v1i64_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -1830,6 +5244,11 @@ ; Don't use SVE for 128-bit vectors. define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) #0 { +; NO_SVE-LABEL: scvtf_v2i64_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v2i64_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: scvtf v0.2d, v0.2d @@ -1839,6 +5258,14 @@ } define void @scvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v4i64_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: stp q0, q1, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: scvtf_v4i64_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -1853,6 +5280,18 @@ } define void @scvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v8i64_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: scvtf_v8i64_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -1879,6 +5318,46 @@ } define void @scvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v16i64_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: scvtf v4.2d, v4.2d +; NO_SVE-NEXT: ldp q6, q7, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #96] +; NO_SVE-NEXT: scvtf v0.2d, v5.2d +; NO_SVE-NEXT: scvtf v1.2d, v6.2d +; NO_SVE-NEXT: scvtf v2.2d, 
v7.2d +; NO_SVE-NEXT: stp q4, q0, [x1, #32] +; NO_SVE-NEXT: stp q1, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v16i64_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1893,6 +5372,78 @@ } define void @scvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: scvtf_v32i64_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: scvtf v0.2d, v0.2d +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: scvtf v1.2d, v1.2d +; NO_SVE-NEXT: scvtf v2.2d, v2.2d +; NO_SVE-NEXT: ldp q4, q5, [x0, #160] +; NO_SVE-NEXT: scvtf v3.2d, v3.2d +; NO_SVE-NEXT: scvtf v4.2d, v4.2d +; NO_SVE-NEXT: ldp q6, q7, [x0, #128] +; NO_SVE-NEXT: ldp q16, q17, [x0, #96] +; NO_SVE-NEXT: ldp q18, q19, [x0, #64] +; NO_SVE-NEXT: ldp q20, q21, [x0, #32] +; NO_SVE-NEXT: ldp q22, q23, [x0] +; NO_SVE-NEXT: stp q2, q3, [x1, #192] +; NO_SVE-NEXT: scvtf v2.2d, v7.2d +; NO_SVE-NEXT: stp q0, q1, [x1, #224] +; NO_SVE-NEXT: scvtf v0.2d, v5.2d +; NO_SVE-NEXT: scvtf v1.2d, v6.2d +; NO_SVE-NEXT: scvtf v3.2d, v16.2d +; NO_SVE-NEXT: stp q4, q0, [x1, #160] +; NO_SVE-NEXT: scvtf v4.2d, v17.2d +; NO_SVE-NEXT: scvtf v0.2d, v18.2d +; NO_SVE-NEXT: stp q1, q2, [x1, #128] +; NO_SVE-NEXT: scvtf v1.2d, v19.2d +; NO_SVE-NEXT: scvtf v2.2d, v20.2d +; NO_SVE-NEXT: stp q3, q4, [x1, #96] +; NO_SVE-NEXT: scvtf v3.2d, v21.2d +; NO_SVE-NEXT: scvtf v4.2d, v22.2d +; NO_SVE-NEXT: stp q0, q1, [x1, #64] +; NO_SVE-NEXT: scvtf v0.2d, v23.2d +; NO_SVE-NEXT: stp q2, q3, [x1, #32] +; NO_SVE-NEXT: stp q4, q0, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: scvtf_v32i64_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: mov x13, #28 +; VBITS_EQ_256-NEXT: mov x14, #16 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_EQ_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_EQ_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_EQ_256-NEXT: scvtf z5.d, p0/m, z5.d +; VBITS_EQ_256-NEXT: scvtf z4.d, p0/m, z4.d +; 
VBITS_EQ_256-NEXT: scvtf z6.d, p0/m, z6.d +; VBITS_EQ_256-NEXT: scvtf z7.d, p0/m, z7.d +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 @@ -22,6 +23,13 @@ ; Don't use SVE for 64-bit vectors. define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v8i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v2.8b, v2.8b, #7 +; NO_SVE-NEXT: cmlt v2.8b, v2.8b, #0 +; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v2.8b, v2.8b, #7 @@ -34,6 +42,13 @@ ; Don't use SVE for 128-bit vectors. define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v2.16b, v2.16b, #7 +; NO_SVE-NEXT: cmlt v2.16b, v2.16b, #0 +; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v2.16b, v2.16b, #7 @@ -45,6 +60,81 @@ } define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, <32 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2] +; NO_SVE-NEXT: ldrh w9, [x2, #2] +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: sbfx w10, w8, #0, #1 +; NO_SVE-NEXT: sbfx w11, w9, #0, #1 +; NO_SVE-NEXT: fmov s0, w10 +; NO_SVE-NEXT: sbfx w10, w8, #1, #1 +; NO_SVE-NEXT: fmov s1, w11 +; NO_SVE-NEXT: sbfx w11, w9, #1, #1 +; NO_SVE-NEXT: ldp q4, q5, [x1] +; NO_SVE-NEXT: mov v0.b[1], w10 +; NO_SVE-NEXT: sbfx w10, w8, #2, #1 +; NO_SVE-NEXT: mov v1.b[1], w11 +; NO_SVE-NEXT: sbfx w11, w9, #2, #1 +; NO_SVE-NEXT: mov v0.b[2], w10 +; NO_SVE-NEXT: sbfx w10, w8, #3, #1 +; NO_SVE-NEXT: mov v1.b[2], w11 +; NO_SVE-NEXT: sbfx w11, w9, #3, #1 +; NO_SVE-NEXT: mov v0.b[3], w10 +; NO_SVE-NEXT: sbfx w10, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[3], w11 +; NO_SVE-NEXT: sbfx w11, w9, #4, #1 +; NO_SVE-NEXT: mov v0.b[4], w10 +; NO_SVE-NEXT: sbfx w10, w8, #5, #1 +; NO_SVE-NEXT: mov v1.b[4], w11 +; NO_SVE-NEXT: sbfx w11, w9, #5, #1 +; NO_SVE-NEXT: mov v0.b[5], w10 +; NO_SVE-NEXT: sbfx w10, w8, #6, #1 +; NO_SVE-NEXT: mov v1.b[5], w11 +; NO_SVE-NEXT: sbfx w11, w9, #6, #1 +; NO_SVE-NEXT: mov v0.b[6], w10 +; NO_SVE-NEXT: sbfx w10, w8, #7, #1 +; NO_SVE-NEXT: mov v1.b[6], w11 +; NO_SVE-NEXT: sbfx w11, w9, #7, #1 +; NO_SVE-NEXT: mov v0.b[7], w10 +; NO_SVE-NEXT: sbfx w10, w8, #8, #1 +; NO_SVE-NEXT: mov v1.b[7], w11 +; 
NO_SVE-NEXT: sbfx w11, w9, #8, #1 +; NO_SVE-NEXT: mov v0.b[8], w10 +; NO_SVE-NEXT: sbfx w10, w8, #9, #1 +; NO_SVE-NEXT: mov v1.b[8], w11 +; NO_SVE-NEXT: sbfx w11, w9, #9, #1 +; NO_SVE-NEXT: mov v0.b[9], w10 +; NO_SVE-NEXT: sbfx w10, w8, #10, #1 +; NO_SVE-NEXT: mov v1.b[9], w11 +; NO_SVE-NEXT: sbfx w11, w9, #10, #1 +; NO_SVE-NEXT: mov v0.b[10], w10 +; NO_SVE-NEXT: sbfx w10, w8, #11, #1 +; NO_SVE-NEXT: mov v1.b[10], w11 +; NO_SVE-NEXT: sbfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v0.b[11], w10 +; NO_SVE-NEXT: sbfx w10, w8, #12, #1 +; NO_SVE-NEXT: mov v1.b[11], w11 +; NO_SVE-NEXT: sbfx w11, w9, #12, #1 +; NO_SVE-NEXT: mov v0.b[12], w10 +; NO_SVE-NEXT: sbfx w10, w8, #13, #1 +; NO_SVE-NEXT: mov v1.b[12], w11 +; NO_SVE-NEXT: sbfx w11, w9, #13, #1 +; NO_SVE-NEXT: mov v0.b[13], w10 +; NO_SVE-NEXT: sbfx w10, w8, #14, #1 +; NO_SVE-NEXT: mov v1.b[13], w11 +; NO_SVE-NEXT: sbfx w11, w9, #14, #1 +; NO_SVE-NEXT: sbfx w8, w8, #15, #1 +; NO_SVE-NEXT: sbfx w9, w9, #15, #1 +; NO_SVE-NEXT: mov v0.b[14], w10 +; NO_SVE-NEXT: mov v1.b[14], w11 +; NO_SVE-NEXT: mov v0.b[15], w8 +; NO_SVE-NEXT: mov v1.b[15], w9 +; NO_SVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NO_SVE-NEXT: bsl v1.16b, v3.16b, v5.16b +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -53,8 +143,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldr w8, [x2] ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p1.b @@ -130,7 +220,7 @@ ; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w30 @@ -145,6 +235,152 @@ } define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, <64 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2, #4] +; NO_SVE-NEXT: ldrh w9, [x2, #6] +; NO_SVE-NEXT: ldrh w10, [x2] +; NO_SVE-NEXT: ldrh w11, [x2, #2] +; NO_SVE-NEXT: sbfx w12, w8, #0, #1 +; NO_SVE-NEXT: sbfx w13, w9, #0, #1 +; NO_SVE-NEXT: sbfx w14, w10, #0, #1 +; NO_SVE-NEXT: fmov s0, w12 +; NO_SVE-NEXT: sbfx w12, w11, #0, #1 +; NO_SVE-NEXT: fmov s1, w13 +; NO_SVE-NEXT: sbfx w13, w8, #1, #1 +; NO_SVE-NEXT: fmov s2, w14 +; NO_SVE-NEXT: sbfx w14, w9, #1, #1 +; NO_SVE-NEXT: fmov s3, w12 +; NO_SVE-NEXT: sbfx w12, w10, #1, #1 +; NO_SVE-NEXT: mov v0.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w11, #1, #1 +; NO_SVE-NEXT: mov v1.b[1], w14 +; NO_SVE-NEXT: sbfx w14, w8, #2, #1 +; NO_SVE-NEXT: mov v2.b[1], w12 +; NO_SVE-NEXT: sbfx w12, w9, #2, #1 +; NO_SVE-NEXT: mov v3.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w10, #2, #1 +; NO_SVE-NEXT: mov v0.b[2], w14 +; NO_SVE-NEXT: sbfx w14, w11, #2, #1 +; NO_SVE-NEXT: mov v1.b[2], w12 +; NO_SVE-NEXT: sbfx w12, w8, #3, #1 +; NO_SVE-NEXT: mov v2.b[2], w13 +; NO_SVE-NEXT: sbfx w13, w9, #3, #1 +; NO_SVE-NEXT: mov v3.b[2], w14 +; NO_SVE-NEXT: sbfx w14, w10, #3, #1 +; NO_SVE-NEXT: mov v0.b[3], w12 +; NO_SVE-NEXT: sbfx w12, w11, #3, #1 +; NO_SVE-NEXT: mov v1.b[3], w13 +; NO_SVE-NEXT: sbfx w13, w8, #4, #1 +; NO_SVE-NEXT: mov v2.b[3], w14 +; NO_SVE-NEXT: sbfx w14, w9, #4, #1 +; NO_SVE-NEXT: mov v3.b[3], w12 +; NO_SVE-NEXT: sbfx w12, w10, #4, #1 +; 
NO_SVE-NEXT: mov v0.b[4], w13 +; NO_SVE-NEXT: sbfx w13, w11, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w14 +; NO_SVE-NEXT: sbfx w14, w8, #5, #1 +; NO_SVE-NEXT: mov v2.b[4], w12 +; NO_SVE-NEXT: sbfx w12, w9, #5, #1 +; NO_SVE-NEXT: mov v3.b[4], w13 +; NO_SVE-NEXT: sbfx w13, w10, #5, #1 +; NO_SVE-NEXT: mov v0.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w11, #5, #1 +; NO_SVE-NEXT: mov v1.b[5], w12 +; NO_SVE-NEXT: sbfx w12, w8, #6, #1 +; NO_SVE-NEXT: mov v2.b[5], w13 +; NO_SVE-NEXT: sbfx w13, w9, #6, #1 +; NO_SVE-NEXT: mov v3.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w10, #6, #1 +; NO_SVE-NEXT: mov v0.b[6], w12 +; NO_SVE-NEXT: sbfx w12, w11, #6, #1 +; NO_SVE-NEXT: mov v1.b[6], w13 +; NO_SVE-NEXT: sbfx w13, w8, #7, #1 +; NO_SVE-NEXT: mov v2.b[6], w14 +; NO_SVE-NEXT: sbfx w14, w9, #7, #1 +; NO_SVE-NEXT: mov v3.b[6], w12 +; NO_SVE-NEXT: sbfx w12, w10, #7, #1 +; NO_SVE-NEXT: mov v0.b[7], w13 +; NO_SVE-NEXT: sbfx w13, w11, #7, #1 +; NO_SVE-NEXT: mov v1.b[7], w14 +; NO_SVE-NEXT: sbfx w14, w8, #8, #1 +; NO_SVE-NEXT: mov v2.b[7], w12 +; NO_SVE-NEXT: sbfx w12, w9, #8, #1 +; NO_SVE-NEXT: mov v3.b[7], w13 +; NO_SVE-NEXT: sbfx w13, w10, #8, #1 +; NO_SVE-NEXT: mov v0.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w11, #8, #1 +; NO_SVE-NEXT: mov v1.b[8], w12 +; NO_SVE-NEXT: sbfx w12, w8, #9, #1 +; NO_SVE-NEXT: mov v2.b[8], w13 +; NO_SVE-NEXT: sbfx w13, w9, #9, #1 +; NO_SVE-NEXT: mov v3.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w10, #9, #1 +; NO_SVE-NEXT: mov v0.b[9], w12 +; NO_SVE-NEXT: sbfx w12, w11, #9, #1 +; NO_SVE-NEXT: mov v1.b[9], w13 +; NO_SVE-NEXT: sbfx w13, w8, #10, #1 +; NO_SVE-NEXT: mov v2.b[9], w14 +; NO_SVE-NEXT: sbfx w14, w9, #10, #1 +; NO_SVE-NEXT: mov v3.b[9], w12 +; NO_SVE-NEXT: sbfx w12, w10, #10, #1 +; NO_SVE-NEXT: mov v0.b[10], w13 +; NO_SVE-NEXT: sbfx w13, w11, #10, #1 +; NO_SVE-NEXT: mov v1.b[10], w14 +; NO_SVE-NEXT: sbfx w14, w8, #11, #1 +; NO_SVE-NEXT: mov v2.b[10], w12 +; NO_SVE-NEXT: sbfx w12, w9, #11, #1 +; NO_SVE-NEXT: mov v3.b[10], w13 +; NO_SVE-NEXT: sbfx w13, w10, #11, #1 +; NO_SVE-NEXT: mov v0.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w11, #11, #1 +; NO_SVE-NEXT: mov v1.b[11], w12 +; NO_SVE-NEXT: sbfx w12, w8, #12, #1 +; NO_SVE-NEXT: mov v2.b[11], w13 +; NO_SVE-NEXT: sbfx w13, w9, #12, #1 +; NO_SVE-NEXT: mov v3.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w10, #12, #1 +; NO_SVE-NEXT: mov v0.b[12], w12 +; NO_SVE-NEXT: sbfx w12, w11, #12, #1 +; NO_SVE-NEXT: mov v1.b[12], w13 +; NO_SVE-NEXT: sbfx w13, w8, #13, #1 +; NO_SVE-NEXT: mov v2.b[12], w14 +; NO_SVE-NEXT: sbfx w14, w9, #13, #1 +; NO_SVE-NEXT: mov v3.b[12], w12 +; NO_SVE-NEXT: sbfx w12, w10, #13, #1 +; NO_SVE-NEXT: mov v0.b[13], w13 +; NO_SVE-NEXT: sbfx w13, w11, #13, #1 +; NO_SVE-NEXT: mov v1.b[13], w14 +; NO_SVE-NEXT: sbfx w14, w8, #14, #1 +; NO_SVE-NEXT: mov v2.b[13], w12 +; NO_SVE-NEXT: sbfx w12, w9, #14, #1 +; NO_SVE-NEXT: mov v3.b[13], w13 +; NO_SVE-NEXT: sbfx w13, w10, #14, #1 +; NO_SVE-NEXT: mov v0.b[14], w14 +; NO_SVE-NEXT: sbfx w14, w11, #14, #1 +; NO_SVE-NEXT: mov v1.b[14], w12 +; NO_SVE-NEXT: sbfx w8, w8, #15, #1 +; NO_SVE-NEXT: mov v2.b[14], w13 +; NO_SVE-NEXT: sbfx w9, w9, #15, #1 +; NO_SVE-NEXT: mov v3.b[14], w14 +; NO_SVE-NEXT: sbfx w10, w10, #15, #1 +; NO_SVE-NEXT: sbfx w11, w11, #15, #1 +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: mov v0.b[15], w8 +; NO_SVE-NEXT: mov v1.b[15], w9 +; NO_SVE-NEXT: mov v2.b[15], w10 +; NO_SVE-NEXT: mov v3.b[15], w11 +; NO_SVE-NEXT: ldp q6, q7, [x0] +; NO_SVE-NEXT: ldp q16, q17, [x1, #32] +; NO_SVE-NEXT: bsl v0.16b, v4.16b, v16.16b +; NO_SVE-NEXT: ldp q18, q19, [x1] +; NO_SVE-NEXT: bsl v1.16b, v5.16b, v17.16b +; 
NO_SVE-NEXT: bsl v2.16b, v6.16b, v18.16b +; NO_SVE-NEXT: stp q0, q1, [x0, #32] +; NO_SVE-NEXT: bsl v3.16b, v7.16b, v19.16b +; NO_SVE-NEXT: stp q2, q3, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -153,8 +389,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldr x8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ptrue p1.b @@ -309,6 +545,297 @@ } define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, <128 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v128i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2, #14] +; NO_SVE-NEXT: ldrh w9, [x2, #12] +; NO_SVE-NEXT: ldrh w11, [x2, #10] +; NO_SVE-NEXT: ldrh w12, [x2, #8] +; NO_SVE-NEXT: sbfx w13, w8, #0, #1 +; NO_SVE-NEXT: ldrh w10, [x2, #6] +; NO_SVE-NEXT: sbfx w14, w9, #0, #1 +; NO_SVE-NEXT: sbfx w15, w11, #0, #1 +; NO_SVE-NEXT: fmov s0, w13 +; NO_SVE-NEXT: sbfx w13, w12, #0, #1 +; NO_SVE-NEXT: fmov s1, w14 +; NO_SVE-NEXT: sbfx w14, w10, #0, #1 +; NO_SVE-NEXT: fmov s2, w15 +; NO_SVE-NEXT: sbfx w15, w8, #1, #1 +; NO_SVE-NEXT: fmov s3, w13 +; NO_SVE-NEXT: sbfx w13, w9, #1, #1 +; NO_SVE-NEXT: fmov s4, w14 +; NO_SVE-NEXT: sbfx w14, w11, #1, #1 +; NO_SVE-NEXT: mov v0.b[1], w15 +; NO_SVE-NEXT: sbfx w15, w12, #1, #1 +; NO_SVE-NEXT: mov v1.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w10, #1, #1 +; NO_SVE-NEXT: mov v2.b[1], w14 +; NO_SVE-NEXT: sbfx w14, w8, #2, #1 +; NO_SVE-NEXT: mov v3.b[1], w15 +; NO_SVE-NEXT: sbfx w15, w9, #2, #1 +; NO_SVE-NEXT: mov v4.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w11, #2, #1 +; NO_SVE-NEXT: mov v0.b[2], w14 +; NO_SVE-NEXT: sbfx w14, w12, #2, #1 +; NO_SVE-NEXT: mov v1.b[2], w15 +; NO_SVE-NEXT: sbfx w15, w8, #3, #1 +; NO_SVE-NEXT: mov v2.b[2], w13 +; NO_SVE-NEXT: sbfx w13, w9, #3, #1 +; NO_SVE-NEXT: mov v3.b[2], w14 +; NO_SVE-NEXT: sbfx w14, w10, #2, #1 +; NO_SVE-NEXT: mov v0.b[3], w15 +; NO_SVE-NEXT: sbfx w15, w11, #3, #1 +; NO_SVE-NEXT: mov v1.b[3], w13 +; NO_SVE-NEXT: sbfx w13, w8, #4, #1 +; NO_SVE-NEXT: mov v4.b[2], w14 +; NO_SVE-NEXT: sbfx w14, w9, #4, #1 +; NO_SVE-NEXT: mov v2.b[3], w15 +; NO_SVE-NEXT: sbfx w15, w12, #3, #1 +; NO_SVE-NEXT: mov v0.b[4], w13 +; NO_SVE-NEXT: sbfx w13, w11, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w14 +; NO_SVE-NEXT: sbfx w14, w8, #5, #1 +; NO_SVE-NEXT: mov v3.b[3], w15 +; NO_SVE-NEXT: sbfx w15, w9, #5, #1 +; NO_SVE-NEXT: mov v2.b[4], w13 +; NO_SVE-NEXT: sbfx w13, w12, #4, #1 +; NO_SVE-NEXT: mov v0.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w11, #5, #1 +; NO_SVE-NEXT: mov v1.b[5], w15 +; NO_SVE-NEXT: sbfx w15, w8, #6, #1 +; NO_SVE-NEXT: mov v3.b[4], w13 +; NO_SVE-NEXT: sbfx w13, w9, #6, #1 +; NO_SVE-NEXT: mov v2.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w12, #5, #1 +; NO_SVE-NEXT: mov v0.b[6], w15 +; NO_SVE-NEXT: sbfx w15, w11, #6, #1 +; NO_SVE-NEXT: mov v1.b[6], w13 +; NO_SVE-NEXT: sbfx w13, w8, #7, #1 +; NO_SVE-NEXT: mov v3.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w9, #7, #1 +; NO_SVE-NEXT: mov v2.b[6], w15 +; NO_SVE-NEXT: sbfx w15, w12, #6, #1 +; NO_SVE-NEXT: mov v0.b[7], w13 +; NO_SVE-NEXT: sbfx w13, w11, #7, #1 +; NO_SVE-NEXT: mov v1.b[7], w14 +; NO_SVE-NEXT: sbfx w14, w8, #8, #1 +; NO_SVE-NEXT: mov v3.b[6], w15 +; NO_SVE-NEXT: sbfx w15, w9, #8, #1 +; NO_SVE-NEXT: mov v2.b[7], w13 +; NO_SVE-NEXT: sbfx 
w13, w12, #7, #1 +; NO_SVE-NEXT: mov v0.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w11, #8, #1 +; NO_SVE-NEXT: mov v1.b[8], w15 +; NO_SVE-NEXT: sbfx w15, w8, #9, #1 +; NO_SVE-NEXT: mov v3.b[7], w13 +; NO_SVE-NEXT: sbfx w13, w9, #9, #1 +; NO_SVE-NEXT: mov v2.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w12, #8, #1 +; NO_SVE-NEXT: mov v0.b[9], w15 +; NO_SVE-NEXT: sbfx w15, w11, #9, #1 +; NO_SVE-NEXT: mov v1.b[9], w13 +; NO_SVE-NEXT: sbfx w13, w8, #10, #1 +; NO_SVE-NEXT: mov v3.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w9, #10, #1 +; NO_SVE-NEXT: mov v2.b[9], w15 +; NO_SVE-NEXT: sbfx w15, w12, #9, #1 +; NO_SVE-NEXT: mov v0.b[10], w13 +; NO_SVE-NEXT: sbfx w13, w11, #10, #1 +; NO_SVE-NEXT: mov v1.b[10], w14 +; NO_SVE-NEXT: sbfx w14, w8, #11, #1 +; NO_SVE-NEXT: mov v3.b[9], w15 +; NO_SVE-NEXT: sbfx w15, w9, #11, #1 +; NO_SVE-NEXT: mov v2.b[10], w13 +; NO_SVE-NEXT: sbfx w13, w12, #10, #1 +; NO_SVE-NEXT: mov v0.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w11, #11, #1 +; NO_SVE-NEXT: mov v1.b[11], w15 +; NO_SVE-NEXT: sbfx w15, w8, #12, #1 +; NO_SVE-NEXT: mov v3.b[10], w13 +; NO_SVE-NEXT: sbfx w13, w9, #12, #1 +; NO_SVE-NEXT: mov v2.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w12, #11, #1 +; NO_SVE-NEXT: mov v0.b[12], w15 +; NO_SVE-NEXT: sbfx w15, w11, #12, #1 +; NO_SVE-NEXT: mov v1.b[12], w13 +; NO_SVE-NEXT: sbfx w13, w8, #13, #1 +; NO_SVE-NEXT: mov v3.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w9, #13, #1 +; NO_SVE-NEXT: mov v2.b[12], w15 +; NO_SVE-NEXT: sbfx w15, w12, #12, #1 +; NO_SVE-NEXT: mov v0.b[13], w13 +; NO_SVE-NEXT: sbfx w13, w11, #13, #1 +; NO_SVE-NEXT: mov v1.b[13], w14 +; NO_SVE-NEXT: sbfx w14, w8, #14, #1 +; NO_SVE-NEXT: mov v3.b[12], w15 +; NO_SVE-NEXT: sbfx w15, w9, #14, #1 +; NO_SVE-NEXT: mov v2.b[13], w13 +; NO_SVE-NEXT: ldrh w13, [x2, #4] +; NO_SVE-NEXT: mov v0.b[14], w14 +; NO_SVE-NEXT: sbfx w14, w11, #14, #1 +; NO_SVE-NEXT: mov v1.b[14], w15 +; NO_SVE-NEXT: sbfx w8, w8, #15, #1 +; NO_SVE-NEXT: sbfx w9, w9, #15, #1 +; NO_SVE-NEXT: sbfx w11, w11, #15, #1 +; NO_SVE-NEXT: mov v2.b[14], w14 +; NO_SVE-NEXT: sbfx w14, w12, #13, #1 +; NO_SVE-NEXT: mov v0.b[15], w8 +; NO_SVE-NEXT: sbfx w8, w10, #3, #1 +; NO_SVE-NEXT: mov v1.b[15], w9 +; NO_SVE-NEXT: sbfx w9, w13, #0, #1 +; NO_SVE-NEXT: mov v3.b[13], w14 +; NO_SVE-NEXT: sbfx w14, w12, #14, #1 +; NO_SVE-NEXT: mov v4.b[3], w8 +; NO_SVE-NEXT: ldrh w8, [x2, #2] +; NO_SVE-NEXT: fmov s5, w9 +; NO_SVE-NEXT: sbfx w9, w13, #1, #1 +; NO_SVE-NEXT: mov v2.b[15], w11 +; NO_SVE-NEXT: sbfx w11, w10, #4, #1 +; NO_SVE-NEXT: mov v3.b[14], w14 +; NO_SVE-NEXT: sbfx w14, w8, #0, #1 +; NO_SVE-NEXT: mov v5.b[1], w9 +; NO_SVE-NEXT: sbfx w12, w12, #15, #1 +; NO_SVE-NEXT: mov v4.b[4], w11 +; NO_SVE-NEXT: sbfx w11, w13, #2, #1 +; NO_SVE-NEXT: fmov s6, w14 +; NO_SVE-NEXT: sbfx w14, w8, #1, #1 +; NO_SVE-NEXT: ldrh w9, [x2] +; NO_SVE-NEXT: mov v5.b[2], w11 +; NO_SVE-NEXT: sbfx w11, w10, #5, #1 +; NO_SVE-NEXT: mov v3.b[15], w12 +; NO_SVE-NEXT: sbfx w12, w13, #3, #1 +; NO_SVE-NEXT: mov v6.b[1], w14 +; NO_SVE-NEXT: sbfx w14, w9, #0, #1 +; NO_SVE-NEXT: mov v4.b[5], w11 +; NO_SVE-NEXT: sbfx w11, w8, #2, #1 +; NO_SVE-NEXT: mov v5.b[3], w12 +; NO_SVE-NEXT: sbfx w12, w10, #6, #1 +; NO_SVE-NEXT: fmov s7, w14 +; NO_SVE-NEXT: sbfx w14, w9, #1, #1 +; NO_SVE-NEXT: mov v6.b[2], w11 +; NO_SVE-NEXT: sbfx w11, w13, #4, #1 +; NO_SVE-NEXT: mov v4.b[6], w12 +; NO_SVE-NEXT: sbfx w12, w8, #3, #1 +; NO_SVE-NEXT: mov v7.b[1], w14 +; NO_SVE-NEXT: sbfx w14, w10, #7, #1 +; NO_SVE-NEXT: mov v5.b[4], w11 +; NO_SVE-NEXT: sbfx w11, w9, #2, #1 +; NO_SVE-NEXT: mov v6.b[3], w12 +; NO_SVE-NEXT: sbfx w12, w13, #5, #1 +; NO_SVE-NEXT: mov 
v4.b[7], w14 +; NO_SVE-NEXT: sbfx w14, w8, #4, #1 +; NO_SVE-NEXT: mov v7.b[2], w11 +; NO_SVE-NEXT: sbfx w11, w10, #8, #1 +; NO_SVE-NEXT: mov v5.b[5], w12 +; NO_SVE-NEXT: sbfx w12, w9, #3, #1 +; NO_SVE-NEXT: mov v6.b[4], w14 +; NO_SVE-NEXT: sbfx w14, w13, #6, #1 +; NO_SVE-NEXT: mov v4.b[8], w11 +; NO_SVE-NEXT: sbfx w11, w8, #5, #1 +; NO_SVE-NEXT: mov v7.b[3], w12 +; NO_SVE-NEXT: sbfx w12, w10, #9, #1 +; NO_SVE-NEXT: mov v5.b[6], w14 +; NO_SVE-NEXT: sbfx w14, w9, #4, #1 +; NO_SVE-NEXT: mov v6.b[5], w11 +; NO_SVE-NEXT: sbfx w11, w13, #7, #1 +; NO_SVE-NEXT: mov v4.b[9], w12 +; NO_SVE-NEXT: sbfx w12, w8, #6, #1 +; NO_SVE-NEXT: mov v7.b[4], w14 +; NO_SVE-NEXT: sbfx w14, w10, #10, #1 +; NO_SVE-NEXT: mov v5.b[7], w11 +; NO_SVE-NEXT: sbfx w11, w9, #5, #1 +; NO_SVE-NEXT: mov v6.b[6], w12 +; NO_SVE-NEXT: sbfx w12, w13, #8, #1 +; NO_SVE-NEXT: mov v4.b[10], w14 +; NO_SVE-NEXT: sbfx w14, w8, #7, #1 +; NO_SVE-NEXT: mov v7.b[5], w11 +; NO_SVE-NEXT: sbfx w11, w10, #11, #1 +; NO_SVE-NEXT: mov v5.b[8], w12 +; NO_SVE-NEXT: sbfx w12, w9, #6, #1 +; NO_SVE-NEXT: mov v6.b[7], w14 +; NO_SVE-NEXT: sbfx w14, w13, #9, #1 +; NO_SVE-NEXT: mov v4.b[11], w11 +; NO_SVE-NEXT: sbfx w11, w8, #8, #1 +; NO_SVE-NEXT: mov v7.b[6], w12 +; NO_SVE-NEXT: sbfx w12, w10, #12, #1 +; NO_SVE-NEXT: mov v5.b[9], w14 +; NO_SVE-NEXT: sbfx w14, w9, #7, #1 +; NO_SVE-NEXT: mov v6.b[8], w11 +; NO_SVE-NEXT: sbfx w11, w13, #10, #1 +; NO_SVE-NEXT: mov v4.b[12], w12 +; NO_SVE-NEXT: sbfx w12, w8, #9, #1 +; NO_SVE-NEXT: mov v7.b[7], w14 +; NO_SVE-NEXT: sbfx w14, w10, #13, #1 +; NO_SVE-NEXT: mov v5.b[10], w11 +; NO_SVE-NEXT: sbfx w11, w9, #8, #1 +; NO_SVE-NEXT: mov v6.b[9], w12 +; NO_SVE-NEXT: sbfx w12, w13, #11, #1 +; NO_SVE-NEXT: mov v4.b[13], w14 +; NO_SVE-NEXT: sbfx w14, w8, #10, #1 +; NO_SVE-NEXT: mov v7.b[8], w11 +; NO_SVE-NEXT: sbfx w11, w10, #14, #1 +; NO_SVE-NEXT: mov v5.b[11], w12 +; NO_SVE-NEXT: sbfx w12, w9, #9, #1 +; NO_SVE-NEXT: mov v6.b[10], w14 +; NO_SVE-NEXT: sbfx w14, w13, #12, #1 +; NO_SVE-NEXT: mov v4.b[14], w11 +; NO_SVE-NEXT: sbfx w11, w8, #11, #1 +; NO_SVE-NEXT: mov v7.b[9], w12 +; NO_SVE-NEXT: sbfx w12, w9, #10, #1 +; NO_SVE-NEXT: mov v5.b[12], w14 +; NO_SVE-NEXT: sbfx w10, w10, #15, #1 +; NO_SVE-NEXT: mov v6.b[11], w11 +; NO_SVE-NEXT: sbfx w11, w13, #13, #1 +; NO_SVE-NEXT: ldp q17, q16, [x0, #96] +; NO_SVE-NEXT: mov v7.b[10], w12 +; NO_SVE-NEXT: sbfx w12, w8, #13, #1 +; NO_SVE-NEXT: mov v5.b[13], w11 +; NO_SVE-NEXT: sbfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v4.b[15], w10 +; NO_SVE-NEXT: sbfx w10, w8, #12, #1 +; NO_SVE-NEXT: mov v7.b[11], w11 +; NO_SVE-NEXT: sbfx w11, w9, #12, #1 +; NO_SVE-NEXT: mov v6.b[12], w10 +; NO_SVE-NEXT: sbfx w10, w13, #14, #1 +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: mov v7.b[12], w11 +; NO_SVE-NEXT: sbfx w11, w9, #13, #1 +; NO_SVE-NEXT: mov v6.b[13], w12 +; NO_SVE-NEXT: sbfx w12, w8, #14, #1 +; NO_SVE-NEXT: mov v5.b[14], w10 +; NO_SVE-NEXT: sbfx w10, w9, #14, #1 +; NO_SVE-NEXT: sbfx w8, w8, #15, #1 +; NO_SVE-NEXT: sbfx w9, w9, #15, #1 +; NO_SVE-NEXT: mov v7.b[13], w11 +; NO_SVE-NEXT: sbfx w11, w13, #15, #1 +; NO_SVE-NEXT: ldp q25, q24, [x1, #96] +; NO_SVE-NEXT: mov v6.b[14], w12 +; NO_SVE-NEXT: mov v5.b[15], w11 +; NO_SVE-NEXT: mov v7.b[14], w10 +; NO_SVE-NEXT: bsl v1.16b, v17.16b, v25.16b +; NO_SVE-NEXT: mov v6.b[15], w8 +; NO_SVE-NEXT: ldp q27, q26, [x1, #64] +; NO_SVE-NEXT: mov v7.b[15], w9 +; NO_SVE-NEXT: bsl v0.16b, v16.16b, v24.16b +; NO_SVE-NEXT: bsl v3.16b, v19.16b, v27.16b +; NO_SVE-NEXT: ldp q21, q20, [x0, #32] +; NO_SVE-NEXT: bsl v2.16b, v18.16b, v26.16b +; NO_SVE-NEXT: ldp 
q23, q22, [x0] +; NO_SVE-NEXT: ldp q29, q28, [x1, #32] +; NO_SVE-NEXT: ldp q31, q30, [x1] +; NO_SVE-NEXT: stp q3, q2, [x0, #64] +; NO_SVE-NEXT: stp q1, q0, [x0, #96] +; NO_SVE-NEXT: mov v0.16b, v5.16b +; NO_SVE-NEXT: mov v1.16b, v6.16b +; NO_SVE-NEXT: mov v2.16b, v7.16b +; NO_SVE-NEXT: bsl v4.16b, v20.16b, v28.16b +; NO_SVE-NEXT: bsl v0.16b, v21.16b, v29.16b +; NO_SVE-NEXT: bsl v1.16b, v22.16b, v30.16b +; NO_SVE-NEXT: bsl v2.16b, v23.16b, v31.16b +; NO_SVE-NEXT: stp q0, q4, [x0, #32] +; NO_SVE-NEXT: stp q2, q1, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -317,8 +844,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldr x8, [x2, #8] ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 ; VBITS_GE_1024-NEXT: ptrue p1.b @@ -602,6 +1129,579 @@ } define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, <256 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v256i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w11, [x2, #30] +; NO_SVE-NEXT: ldrh w12, [x2, #26] +; NO_SVE-NEXT: ldrh w9, [x2, #24] +; NO_SVE-NEXT: sbfx w14, w11, #0, #1 +; NO_SVE-NEXT: ldrh w8, [x2, #28] +; NO_SVE-NEXT: sbfx w4, w12, #0, #1 +; NO_SVE-NEXT: ldrh w10, [x2, #22] +; NO_SVE-NEXT: sbfx w13, w11, #1, #1 +; NO_SVE-NEXT: sbfx w15, w11, #2, #1 +; NO_SVE-NEXT: fmov s1, w14 +; NO_SVE-NEXT: sbfx w14, w9, #0, #1 +; NO_SVE-NEXT: sbfx w18, w8, #0, #1 +; NO_SVE-NEXT: fmov s2, w4 +; NO_SVE-NEXT: sbfx w4, w12, #1, #1 +; NO_SVE-NEXT: sbfx w16, w11, #3, #1 +; NO_SVE-NEXT: fmov s3, w14 +; NO_SVE-NEXT: sbfx w14, w10, #0, #1 +; NO_SVE-NEXT: mov v1.b[1], w13 +; NO_SVE-NEXT: fmov s0, w18 +; NO_SVE-NEXT: sbfx w13, w8, #1, #1 +; NO_SVE-NEXT: sbfx w17, w11, #4, #1 +; NO_SVE-NEXT: mov v2.b[1], w4 +; NO_SVE-NEXT: fmov s4, w14 +; NO_SVE-NEXT: sbfx w14, w12, #2, #1 +; NO_SVE-NEXT: sbfx w3, w11, #5, #1 +; NO_SVE-NEXT: mov v0.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w9, #1, #1 +; NO_SVE-NEXT: mov v1.b[2], w15 +; NO_SVE-NEXT: sbfx w15, w10, #1, #1 +; NO_SVE-NEXT: mov v2.b[2], w14 +; NO_SVE-NEXT: sbfx w18, w11, #6, #1 +; NO_SVE-NEXT: mov v3.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w12, #3, #1 +; NO_SVE-NEXT: mov v4.b[1], w15 +; NO_SVE-NEXT: sbfx w15, w8, #2, #1 +; NO_SVE-NEXT: mov v1.b[3], w16 +; NO_SVE-NEXT: sbfx w16, w9, #2, #1 +; NO_SVE-NEXT: mov v2.b[3], w13 +; NO_SVE-NEXT: sbfx w4, w11, #7, #1 +; NO_SVE-NEXT: mov v0.b[2], w15 +; NO_SVE-NEXT: sbfx w15, w12, #4, #1 +; NO_SVE-NEXT: mov v3.b[2], w16 +; NO_SVE-NEXT: sbfx w16, w10, #2, #1 +; NO_SVE-NEXT: mov v1.b[4], w17 +; NO_SVE-NEXT: sbfx w17, w8, #3, #1 +; NO_SVE-NEXT: mov v2.b[4], w15 +; NO_SVE-NEXT: sbfx w14, w11, #8, #1 +; NO_SVE-NEXT: mov v4.b[2], w16 +; NO_SVE-NEXT: sbfx w16, w12, #5, #1 +; NO_SVE-NEXT: mov v0.b[3], w17 +; NO_SVE-NEXT: sbfx w17, w9, #3, #1 +; NO_SVE-NEXT: mov v1.b[5], w3 +; NO_SVE-NEXT: sbfx w3, w10, #3, #1 +; NO_SVE-NEXT: mov v2.b[5], w16 +; NO_SVE-NEXT: sbfx w13, w11, #9, #1 +; NO_SVE-NEXT: mov v3.b[3], w17 +; NO_SVE-NEXT: sbfx w17, w12, #6, #1 +; NO_SVE-NEXT: mov v4.b[3], w3 +; NO_SVE-NEXT: sbfx w3, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[6], w18 +; NO_SVE-NEXT: sbfx w18, w9, #4, #1 +; NO_SVE-NEXT: mov v2.b[6], w17 +; NO_SVE-NEXT: sbfx w15, w11, #10, #1 +; NO_SVE-NEXT: mov v0.b[4], w3 +; NO_SVE-NEXT: 
sbfx w3, w12, #7, #1 +; NO_SVE-NEXT: mov v3.b[4], w18 +; NO_SVE-NEXT: sbfx w18, w10, #4, #1 +; NO_SVE-NEXT: mov v1.b[7], w4 +; NO_SVE-NEXT: sbfx w4, w8, #5, #1 +; NO_SVE-NEXT: mov v2.b[7], w3 +; NO_SVE-NEXT: sbfx w16, w11, #11, #1 +; NO_SVE-NEXT: mov v4.b[4], w18 +; NO_SVE-NEXT: sbfx w18, w12, #8, #1 +; NO_SVE-NEXT: mov v0.b[5], w4 +; NO_SVE-NEXT: sbfx w4, w9, #5, #1 +; NO_SVE-NEXT: mov v1.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w10, #5, #1 +; NO_SVE-NEXT: mov v2.b[8], w18 +; NO_SVE-NEXT: sbfx w17, w11, #12, #1 +; NO_SVE-NEXT: sbfx w3, w11, #13, #1 +; NO_SVE-NEXT: sbfx w18, w11, #14, #1 +; NO_SVE-NEXT: sbfx w5, w11, #15, #1 +; NO_SVE-NEXT: sbfx w11, w12, #9, #1 +; NO_SVE-NEXT: mov v3.b[5], w4 +; NO_SVE-NEXT: sbfx w4, w9, #9, #1 +; NO_SVE-NEXT: mov v4.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w8, #6, #1 +; NO_SVE-NEXT: mov v1.b[9], w13 +; NO_SVE-NEXT: sbfx w13, w9, #6, #1 +; NO_SVE-NEXT: mov v2.b[9], w11 +; NO_SVE-NEXT: sbfx w11, w8, #7, #1 +; NO_SVE-NEXT: mov v0.b[6], w14 +; NO_SVE-NEXT: sbfx w14, w12, #10, #1 +; NO_SVE-NEXT: mov v3.b[6], w13 +; NO_SVE-NEXT: sbfx w13, w10, #6, #1 +; NO_SVE-NEXT: mov v1.b[10], w15 +; NO_SVE-NEXT: sbfx w15, w9, #7, #1 +; NO_SVE-NEXT: mov v2.b[10], w14 +; NO_SVE-NEXT: sbfx w14, w8, #8, #1 +; NO_SVE-NEXT: mov v4.b[6], w13 +; NO_SVE-NEXT: sbfx w13, w12, #11, #1 +; NO_SVE-NEXT: mov v3.b[7], w15 +; NO_SVE-NEXT: sbfx w15, w12, #12, #1 +; NO_SVE-NEXT: mov v1.b[11], w16 +; NO_SVE-NEXT: sbfx w16, w9, #8, #1 +; NO_SVE-NEXT: mov v2.b[11], w13 +; NO_SVE-NEXT: sbfx w13, w12, #13, #1 +; NO_SVE-NEXT: mov v0.b[7], w11 +; NO_SVE-NEXT: ldrh w11, [x2, #20] +; NO_SVE-NEXT: mov v3.b[8], w16 +; NO_SVE-NEXT: sbfx w16, w9, #10, #1 +; NO_SVE-NEXT: mov v1.b[12], w17 +; NO_SVE-NEXT: mov v2.b[12], w15 +; NO_SVE-NEXT: sbfx w15, w10, #7, #1 +; NO_SVE-NEXT: mov v0.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w11, #0, #1 +; NO_SVE-NEXT: mov v3.b[9], w4 +; NO_SVE-NEXT: sbfx w17, w11, #8, #1 +; NO_SVE-NEXT: mov v4.b[7], w15 +; NO_SVE-NEXT: sbfx w15, w10, #8, #1 +; NO_SVE-NEXT: mov v2.b[13], w13 +; NO_SVE-NEXT: sbfx w13, w11, #1, #1 +; NO_SVE-NEXT: fmov s5, w14 +; NO_SVE-NEXT: sbfx w14, w9, #11, #1 +; NO_SVE-NEXT: mov v3.b[10], w16 +; NO_SVE-NEXT: sbfx w16, w11, #2, #1 +; NO_SVE-NEXT: mov v4.b[8], w15 +; NO_SVE-NEXT: sbfx w15, w10, #9, #1 +; NO_SVE-NEXT: mov v5.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w12, #14, #1 +; NO_SVE-NEXT: sbfx w12, w12, #15, #1 +; NO_SVE-NEXT: mov v3.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w9, #12, #1 +; NO_SVE-NEXT: mov v2.b[14], w13 +; NO_SVE-NEXT: sbfx w13, w11, #3, #1 +; NO_SVE-NEXT: mov v5.b[2], w16 +; NO_SVE-NEXT: mov v4.b[9], w15 +; NO_SVE-NEXT: sbfx w15, w11, #4, #1 +; NO_SVE-NEXT: mov v3.b[12], w14 +; NO_SVE-NEXT: sbfx w14, w10, #10, #1 +; NO_SVE-NEXT: mov v2.b[15], w12 +; NO_SVE-NEXT: sbfx w12, w11, #6, #1 +; NO_SVE-NEXT: mov v5.b[3], w13 +; NO_SVE-NEXT: sbfx w13, w9, #13, #1 +; NO_SVE-NEXT: mov v4.b[10], w14 +; NO_SVE-NEXT: sbfx w14, w10, #11, #1 +; NO_SVE-NEXT: mov v1.b[13], w3 +; NO_SVE-NEXT: ldrh w3, [x2, #6] +; NO_SVE-NEXT: mov v3.b[13], w13 +; NO_SVE-NEXT: sbfx w13, w11, #5, #1 +; NO_SVE-NEXT: mov v5.b[4], w15 +; NO_SVE-NEXT: sbfx w15, w10, #12, #1 +; NO_SVE-NEXT: mov v4.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w9, #14, #1 +; NO_SVE-NEXT: mov v1.b[14], w18 +; NO_SVE-NEXT: ldp q21, q25, [x0, #32] +; NO_SVE-NEXT: mov v5.b[5], w13 +; NO_SVE-NEXT: sbfx w13, w11, #7, #1 +; NO_SVE-NEXT: mov v4.b[12], w15 +; NO_SVE-NEXT: mov v3.b[14], w14 +; NO_SVE-NEXT: sbfx w14, w10, #13, #1 +; NO_SVE-NEXT: mov v1.b[15], w5 +; NO_SVE-NEXT: mov v5.b[6], w12 +; NO_SVE-NEXT: mov v4.b[13], w14 +; 
NO_SVE-NEXT: sbfx w14, w10, #14, #1 +; NO_SVE-NEXT: ldrh w15, [x2, #14] +; NO_SVE-NEXT: sbfx w10, w10, #15, #1 +; NO_SVE-NEXT: ldrh w12, [x2, #18] +; NO_SVE-NEXT: mov v5.b[7], w13 +; NO_SVE-NEXT: ldrh w13, [x2, #16] +; NO_SVE-NEXT: mov v4.b[14], w14 +; NO_SVE-NEXT: sbfx w14, w15, #0, #1 +; NO_SVE-NEXT: sbfx w16, w12, #0, #1 +; NO_SVE-NEXT: sbfx w18, w12, #1, #1 +; NO_SVE-NEXT: ldp q22, q26, [x1, #32] +; NO_SVE-NEXT: mov v5.b[8], w17 +; NO_SVE-NEXT: sbfx w17, w15, #1, #1 +; NO_SVE-NEXT: fmov s16, w14 +; NO_SVE-NEXT: sbfx w14, w12, #2, #1 +; NO_SVE-NEXT: fmov s6, w16 +; NO_SVE-NEXT: sbfx w16, w13, #0, #1 +; NO_SVE-NEXT: mov v4.b[15], w10 +; NO_SVE-NEXT: mov v16.b[1], w17 +; NO_SVE-NEXT: sbfx w17, w15, #2, #1 +; NO_SVE-NEXT: mov v6.b[1], w18 +; NO_SVE-NEXT: sbfx w18, w13, #1, #1 +; NO_SVE-NEXT: fmov s7, w16 +; NO_SVE-NEXT: sbfx w16, w11, #9, #1 +; NO_SVE-NEXT: ldp q24, q23, [x0, #64] +; NO_SVE-NEXT: mov v16.b[2], w17 +; NO_SVE-NEXT: sbfx w17, w13, #3, #1 +; NO_SVE-NEXT: mov v7.b[1], w18 +; NO_SVE-NEXT: sbfx w18, w13, #2, #1 +; NO_SVE-NEXT: mov v6.b[2], w14 +; NO_SVE-NEXT: sbfx w14, w12, #3, #1 +; NO_SVE-NEXT: mov v5.b[9], w16 +; NO_SVE-NEXT: sbfx w16, w15, #3, #1 +; NO_SVE-NEXT: mov v7.b[2], w18 +; NO_SVE-NEXT: sbfx w18, w15, #4, #1 +; NO_SVE-NEXT: mov v16.b[3], w16 +; NO_SVE-NEXT: sbfx w16, w11, #10, #1 +; NO_SVE-NEXT: mov v6.b[3], w14 +; NO_SVE-NEXT: sbfx w14, w12, #4, #1 +; NO_SVE-NEXT: mov v7.b[3], w17 +; NO_SVE-NEXT: sbfx w17, w13, #4, #1 +; NO_SVE-NEXT: mov v16.b[4], w18 +; NO_SVE-NEXT: sbfx w18, w15, #6, #1 +; NO_SVE-NEXT: mov v6.b[4], w14 +; NO_SVE-NEXT: sbfx w14, w12, #5, #1 +; NO_SVE-NEXT: mov v5.b[10], w16 +; NO_SVE-NEXT: sbfx w16, w15, #5, #1 +; NO_SVE-NEXT: mov v7.b[4], w17 +; NO_SVE-NEXT: sbfx w17, w13, #5, #1 +; NO_SVE-NEXT: mov v16.b[5], w16 +; NO_SVE-NEXT: sbfx w16, w11, #11, #1 +; NO_SVE-NEXT: mov v6.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w12, #6, #1 +; NO_SVE-NEXT: mov v7.b[5], w17 +; NO_SVE-NEXT: sbfx w17, w13, #6, #1 +; NO_SVE-NEXT: mov v5.b[11], w16 +; NO_SVE-NEXT: sbfx w16, w15, #7, #1 +; NO_SVE-NEXT: mov v16.b[6], w18 +; NO_SVE-NEXT: sbfx w18, w15, #8, #1 +; NO_SVE-NEXT: mov v6.b[6], w14 +; NO_SVE-NEXT: sbfx w14, w12, #7, #1 +; NO_SVE-NEXT: mov v7.b[6], w17 +; NO_SVE-NEXT: sbfx w17, w13, #7, #1 +; NO_SVE-NEXT: mov v16.b[7], w16 +; NO_SVE-NEXT: sbfx w16, w11, #12, #1 +; NO_SVE-NEXT: mov v6.b[7], w14 +; NO_SVE-NEXT: sbfx w14, w12, #8, #1 +; NO_SVE-NEXT: mov v7.b[7], w17 +; NO_SVE-NEXT: sbfx w17, w13, #8, #1 +; NO_SVE-NEXT: mov v5.b[12], w16 +; NO_SVE-NEXT: sbfx w16, w15, #9, #1 +; NO_SVE-NEXT: mov v16.b[8], w18 +; NO_SVE-NEXT: sbfx w18, w15, #10, #1 +; NO_SVE-NEXT: mov v6.b[8], w14 +; NO_SVE-NEXT: sbfx w14, w12, #9, #1 +; NO_SVE-NEXT: mov v7.b[8], w17 +; NO_SVE-NEXT: sbfx w17, w13, #9, #1 +; NO_SVE-NEXT: mov v16.b[9], w16 +; NO_SVE-NEXT: sbfx w16, w11, #13, #1 +; NO_SVE-NEXT: mov v6.b[9], w14 +; NO_SVE-NEXT: sbfx w14, w12, #10, #1 +; NO_SVE-NEXT: mov v7.b[9], w17 +; NO_SVE-NEXT: sbfx w17, w13, #10, #1 +; NO_SVE-NEXT: mov v5.b[13], w16 +; NO_SVE-NEXT: sbfx w16, w15, #11, #1 +; NO_SVE-NEXT: mov v16.b[10], w18 +; NO_SVE-NEXT: sbfx w18, w15, #12, #1 +; NO_SVE-NEXT: mov v6.b[10], w14 +; NO_SVE-NEXT: sbfx w14, w12, #11, #1 +; NO_SVE-NEXT: mov v7.b[10], w17 +; NO_SVE-NEXT: sbfx w17, w13, #11, #1 +; NO_SVE-NEXT: mov v16.b[11], w16 +; NO_SVE-NEXT: sbfx w16, w11, #14, #1 +; NO_SVE-NEXT: mov v6.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w12, #12, #1 +; NO_SVE-NEXT: mov v7.b[11], w17 +; NO_SVE-NEXT: sbfx w17, w13, #12, #1 +; NO_SVE-NEXT: mov v5.b[14], w16 +; NO_SVE-NEXT: sbfx w16, w15, #13, 
#1 +; NO_SVE-NEXT: mov v16.b[12], w18 +; NO_SVE-NEXT: sbfx w18, w12, #14, #1 +; NO_SVE-NEXT: mov v6.b[12], w14 +; NO_SVE-NEXT: sbfx w14, w12, #13, #1 +; NO_SVE-NEXT: mov v7.b[12], w17 +; NO_SVE-NEXT: sbfx w17, w13, #13, #1 +; NO_SVE-NEXT: sbfx w12, w12, #15, #1 +; NO_SVE-NEXT: mov v16.b[13], w16 +; NO_SVE-NEXT: sbfx w16, w15, #14, #1 +; NO_SVE-NEXT: mov v6.b[13], w14 +; NO_SVE-NEXT: ldrh w14, [x2, #12] +; NO_SVE-NEXT: mov v7.b[13], w17 +; NO_SVE-NEXT: sbfx w17, w13, #14, #1 +; NO_SVE-NEXT: mov v16.b[14], w16 +; NO_SVE-NEXT: sbfx w16, w14, #0, #1 +; NO_SVE-NEXT: mov v6.b[14], w18 +; NO_SVE-NEXT: sbfx w18, w15, #15, #1 +; NO_SVE-NEXT: ldrh w15, [x2, #8] +; NO_SVE-NEXT: sbfx w4, w14, #1, #1 +; NO_SVE-NEXT: fmov s17, w16 +; NO_SVE-NEXT: ldrh w16, [x2, #10] +; NO_SVE-NEXT: mov v7.b[14], w17 +; NO_SVE-NEXT: sbfx w17, w15, #0, #1 +; NO_SVE-NEXT: mov v16.b[15], w18 +; NO_SVE-NEXT: sbfx w18, w3, #0, #1 +; NO_SVE-NEXT: mov v17.b[1], w4 +; NO_SVE-NEXT: sbfx w4, w16, #0, #1 +; NO_SVE-NEXT: fmov s19, w17 +; NO_SVE-NEXT: sbfx w17, w15, #1, #1 +; NO_SVE-NEXT: fmov s20, w18 +; NO_SVE-NEXT: sbfx w18, w3, #1, #1 +; NO_SVE-NEXT: fmov s18, w4 +; NO_SVE-NEXT: sbfx w4, w14, #2, #1 +; NO_SVE-NEXT: mov v19.b[1], w17 +; NO_SVE-NEXT: sbfx w17, w16, #1, #1 +; NO_SVE-NEXT: mov v20.b[1], w18 +; NO_SVE-NEXT: sbfx w18, w15, #2, #1 +; NO_SVE-NEXT: mov v17.b[2], w4 +; NO_SVE-NEXT: sbfx w4, w3, #2, #1 +; NO_SVE-NEXT: mov v18.b[1], w17 +; NO_SVE-NEXT: sbfx w17, w14, #3, #1 +; NO_SVE-NEXT: mov v19.b[2], w18 +; NO_SVE-NEXT: sbfx w18, w16, #2, #1 +; NO_SVE-NEXT: mov v20.b[2], w4 +; NO_SVE-NEXT: sbfx w4, w15, #3, #1 +; NO_SVE-NEXT: mov v17.b[3], w17 +; NO_SVE-NEXT: sbfx w17, w3, #3, #1 +; NO_SVE-NEXT: mov v18.b[2], w18 +; NO_SVE-NEXT: sbfx w18, w14, #4, #1 +; NO_SVE-NEXT: mov v19.b[3], w4 +; NO_SVE-NEXT: sbfx w4, w16, #3, #1 +; NO_SVE-NEXT: mov v20.b[3], w17 +; NO_SVE-NEXT: sbfx w17, w15, #4, #1 +; NO_SVE-NEXT: mov v17.b[4], w18 +; NO_SVE-NEXT: sbfx w18, w3, #4, #1 +; NO_SVE-NEXT: mov v18.b[3], w4 +; NO_SVE-NEXT: sbfx w4, w14, #5, #1 +; NO_SVE-NEXT: mov v19.b[4], w17 +; NO_SVE-NEXT: sbfx w17, w16, #4, #1 +; NO_SVE-NEXT: mov v20.b[4], w18 +; NO_SVE-NEXT: sbfx w18, w15, #5, #1 +; NO_SVE-NEXT: mov v17.b[5], w4 +; NO_SVE-NEXT: sbfx w4, w3, #5, #1 +; NO_SVE-NEXT: mov v18.b[4], w17 +; NO_SVE-NEXT: sbfx w17, w14, #6, #1 +; NO_SVE-NEXT: mov v19.b[5], w18 +; NO_SVE-NEXT: sbfx w18, w16, #5, #1 +; NO_SVE-NEXT: mov v20.b[5], w4 +; NO_SVE-NEXT: sbfx w4, w15, #6, #1 +; NO_SVE-NEXT: mov v17.b[6], w17 +; NO_SVE-NEXT: sbfx w17, w3, #6, #1 +; NO_SVE-NEXT: mov v18.b[5], w18 +; NO_SVE-NEXT: sbfx w18, w14, #7, #1 +; NO_SVE-NEXT: mov v19.b[6], w4 +; NO_SVE-NEXT: sbfx w4, w16, #6, #1 +; NO_SVE-NEXT: mov v20.b[6], w17 +; NO_SVE-NEXT: sbfx w17, w15, #7, #1 +; NO_SVE-NEXT: mov v17.b[7], w18 +; NO_SVE-NEXT: sbfx w18, w3, #7, #1 +; NO_SVE-NEXT: mov v18.b[6], w4 +; NO_SVE-NEXT: sbfx w4, w14, #8, #1 +; NO_SVE-NEXT: mov v19.b[7], w17 +; NO_SVE-NEXT: sbfx w17, w16, #7, #1 +; NO_SVE-NEXT: mov v20.b[7], w18 +; NO_SVE-NEXT: sbfx w18, w15, #8, #1 +; NO_SVE-NEXT: mov v17.b[8], w4 +; NO_SVE-NEXT: sbfx w4, w3, #8, #1 +; NO_SVE-NEXT: mov v18.b[7], w17 +; NO_SVE-NEXT: sbfx w17, w14, #9, #1 +; NO_SVE-NEXT: mov v19.b[8], w18 +; NO_SVE-NEXT: sbfx w18, w16, #8, #1 +; NO_SVE-NEXT: mov v20.b[8], w4 +; NO_SVE-NEXT: sbfx w4, w15, #9, #1 +; NO_SVE-NEXT: mov v17.b[9], w17 +; NO_SVE-NEXT: sbfx w17, w3, #9, #1 +; NO_SVE-NEXT: mov v18.b[8], w18 +; NO_SVE-NEXT: sbfx w18, w14, #10, #1 +; NO_SVE-NEXT: mov v19.b[9], w4 +; NO_SVE-NEXT: sbfx w4, w16, #9, #1 +; NO_SVE-NEXT: mov v20.b[9], 
w17 +; NO_SVE-NEXT: sbfx w17, w15, #10, #1 +; NO_SVE-NEXT: mov v17.b[10], w18 +; NO_SVE-NEXT: sbfx w18, w3, #10, #1 +; NO_SVE-NEXT: mov v18.b[9], w4 +; NO_SVE-NEXT: sbfx w4, w14, #11, #1 +; NO_SVE-NEXT: mov v19.b[10], w17 +; NO_SVE-NEXT: sbfx w17, w16, #10, #1 +; NO_SVE-NEXT: mov v20.b[10], w18 +; NO_SVE-NEXT: sbfx w18, w15, #11, #1 +; NO_SVE-NEXT: mov v17.b[11], w4 +; NO_SVE-NEXT: sbfx w4, w3, #11, #1 +; NO_SVE-NEXT: mov v18.b[10], w17 +; NO_SVE-NEXT: sbfx w17, w14, #12, #1 +; NO_SVE-NEXT: mov v19.b[11], w18 +; NO_SVE-NEXT: sbfx w18, w16, #11, #1 +; NO_SVE-NEXT: mov v20.b[11], w4 +; NO_SVE-NEXT: sbfx w4, w15, #12, #1 +; NO_SVE-NEXT: mov v17.b[12], w17 +; NO_SVE-NEXT: sbfx w17, w3, #12, #1 +; NO_SVE-NEXT: mov v18.b[11], w18 +; NO_SVE-NEXT: sbfx w18, w14, #13, #1 +; NO_SVE-NEXT: mov v19.b[12], w4 +; NO_SVE-NEXT: sbfx w4, w16, #12, #1 +; NO_SVE-NEXT: mov v20.b[12], w17 +; NO_SVE-NEXT: sbfx w17, w15, #13, #1 +; NO_SVE-NEXT: mov v17.b[13], w18 +; NO_SVE-NEXT: sbfx w18, w3, #13, #1 +; NO_SVE-NEXT: mov v18.b[12], w4 +; NO_SVE-NEXT: sbfx w4, w14, #14, #1 +; NO_SVE-NEXT: mov v19.b[13], w17 +; NO_SVE-NEXT: sbfx w17, w16, #13, #1 +; NO_SVE-NEXT: mov v20.b[13], w18 +; NO_SVE-NEXT: sbfx w18, w15, #14, #1 +; NO_SVE-NEXT: mov v17.b[14], w4 +; NO_SVE-NEXT: sbfx w4, w3, #14, #1 +; NO_SVE-NEXT: mov v18.b[13], w17 +; NO_SVE-NEXT: sbfx w17, w3, #15, #1 +; NO_SVE-NEXT: mov v19.b[14], w18 +; NO_SVE-NEXT: sbfx w18, w16, #14, #1 +; NO_SVE-NEXT: mov v20.b[14], w4 +; NO_SVE-NEXT: sbfx w15, w15, #15, #1 +; NO_SVE-NEXT: sbfx w14, w14, #15, #1 +; NO_SVE-NEXT: mov v18.b[14], w18 +; NO_SVE-NEXT: mov v19.b[15], w15 +; NO_SVE-NEXT: sbfx w15, w16, #15, #1 +; NO_SVE-NEXT: mov v20.b[15], w17 +; NO_SVE-NEXT: mov v17.b[15], w14 +; NO_SVE-NEXT: sbfx w14, w9, #15, #1 +; NO_SVE-NEXT: mov v18.b[15], w15 +; NO_SVE-NEXT: sbfx w9, w11, #15, #1 +; NO_SVE-NEXT: sbfx w11, w13, #15, #1 +; NO_SVE-NEXT: bsl v20.16b, v25.16b, v26.16b +; NO_SVE-NEXT: ldp q25, q27, [x1, #64] +; NO_SVE-NEXT: mov v7.b[15], w11 +; NO_SVE-NEXT: mov v6.b[15], w12 +; NO_SVE-NEXT: mov v5.b[15], w9 +; NO_SVE-NEXT: bsl v19.16b, v24.16b, v25.16b +; NO_SVE-NEXT: mov v3.b[15], w14 +; NO_SVE-NEXT: ldp q24, q26, [x0, #96] +; NO_SVE-NEXT: bsl v18.16b, v23.16b, v27.16b +; NO_SVE-NEXT: ldp q23, q25, [x1, #96] +; NO_SVE-NEXT: bsl v17.16b, v24.16b, v23.16b +; NO_SVE-NEXT: ldp q23, q27, [x0, #128] +; NO_SVE-NEXT: bsl v16.16b, v26.16b, v25.16b +; NO_SVE-NEXT: ldp q25, q24, [x1, #128] +; NO_SVE-NEXT: bsl v7.16b, v23.16b, v25.16b +; NO_SVE-NEXT: ldp q23, q26, [x0, #160] +; NO_SVE-NEXT: bsl v6.16b, v27.16b, v24.16b +; NO_SVE-NEXT: ldp q24, q25, [x1, #160] +; NO_SVE-NEXT: bsl v5.16b, v23.16b, v24.16b +; NO_SVE-NEXT: ldp q23, q27, [x0, #192] +; NO_SVE-NEXT: bit v25.16b, v26.16b, v4.16b +; NO_SVE-NEXT: ldp q26, q24, [x1, #192] +; NO_SVE-NEXT: bsl v3.16b, v23.16b, v26.16b +; NO_SVE-NEXT: ldrh w9, [x2, #4] +; NO_SVE-NEXT: ldrh w12, [x2, #2] +; NO_SVE-NEXT: bsl v2.16b, v27.16b, v24.16b +; NO_SVE-NEXT: sbfx w10, w9, #0, #1 +; NO_SVE-NEXT: sbfx w11, w9, #1, #1 +; NO_SVE-NEXT: ldp q24, q26, [x0, #224] +; NO_SVE-NEXT: sbfx w13, w12, #0, #1 +; NO_SVE-NEXT: fmov s4, w10 +; NO_SVE-NEXT: mov v4.b[1], w11 +; NO_SVE-NEXT: sbfx w11, w8, #9, #1 +; NO_SVE-NEXT: ldp q27, q23, [x1, #224] +; NO_SVE-NEXT: mov v0.b[9], w11 +; NO_SVE-NEXT: ldrh w10, [x2] +; NO_SVE-NEXT: bsl v1.16b, v26.16b, v23.16b +; NO_SVE-NEXT: fmov s23, w13 +; NO_SVE-NEXT: sbfx w13, w12, #1, #1 +; NO_SVE-NEXT: sbfx w14, w10, #0, #1 +; NO_SVE-NEXT: sbfx w11, w10, #2, #1 +; NO_SVE-NEXT: mov v23.b[1], w13 +; NO_SVE-NEXT: sbfx w13, w9, #2, 
#1 +; NO_SVE-NEXT: fmov s26, w14 +; NO_SVE-NEXT: sbfx w14, w10, #1, #1 +; NO_SVE-NEXT: mov v4.b[2], w13 +; NO_SVE-NEXT: sbfx w13, w8, #10, #1 +; NO_SVE-NEXT: mov v26.b[1], w14 +; NO_SVE-NEXT: sbfx w14, w12, #2, #1 +; NO_SVE-NEXT: mov v0.b[10], w13 +; NO_SVE-NEXT: sbfx w13, w10, #3, #1 +; NO_SVE-NEXT: mov v23.b[2], w14 +; NO_SVE-NEXT: sbfx w14, w9, #3, #1 +; NO_SVE-NEXT: mov v26.b[2], w11 +; NO_SVE-NEXT: sbfx w11, w12, #3, #1 +; NO_SVE-NEXT: mov v4.b[3], w14 +; NO_SVE-NEXT: sbfx w14, w8, #11, #1 +; NO_SVE-NEXT: mov v23.b[3], w11 +; NO_SVE-NEXT: sbfx w11, w9, #4, #1 +; NO_SVE-NEXT: mov v26.b[3], w13 +; NO_SVE-NEXT: sbfx w13, w12, #4, #1 +; NO_SVE-NEXT: mov v0.b[11], w14 +; NO_SVE-NEXT: sbfx w14, w10, #4, #1 +; NO_SVE-NEXT: mov v4.b[4], w11 +; NO_SVE-NEXT: sbfx w11, w8, #12, #1 +; NO_SVE-NEXT: mov v23.b[4], w13 +; NO_SVE-NEXT: sbfx w13, w9, #5, #1 +; NO_SVE-NEXT: mov v26.b[4], w14 +; NO_SVE-NEXT: sbfx w14, w12, #5, #1 +; NO_SVE-NEXT: mov v0.b[12], w11 +; NO_SVE-NEXT: sbfx w11, w10, #5, #1 +; NO_SVE-NEXT: mov v4.b[5], w13 +; NO_SVE-NEXT: sbfx w13, w8, #13, #1 +; NO_SVE-NEXT: mov v23.b[5], w14 +; NO_SVE-NEXT: sbfx w14, w9, #6, #1 +; NO_SVE-NEXT: mov v26.b[5], w11 +; NO_SVE-NEXT: sbfx w11, w12, #6, #1 +; NO_SVE-NEXT: mov v0.b[13], w13 +; NO_SVE-NEXT: sbfx w13, w10, #6, #1 +; NO_SVE-NEXT: mov v4.b[6], w14 +; NO_SVE-NEXT: sbfx w14, w8, #14, #1 +; NO_SVE-NEXT: mov v23.b[6], w11 +; NO_SVE-NEXT: sbfx w11, w9, #7, #1 +; NO_SVE-NEXT: mov v26.b[6], w13 +; NO_SVE-NEXT: sbfx w13, w12, #7, #1 +; NO_SVE-NEXT: mov v0.b[14], w14 +; NO_SVE-NEXT: sbfx w14, w10, #7, #1 +; NO_SVE-NEXT: sbfx w8, w8, #15, #1 +; NO_SVE-NEXT: mov v4.b[7], w11 +; NO_SVE-NEXT: sbfx w11, w9, #8, #1 +; NO_SVE-NEXT: mov v23.b[7], w13 +; NO_SVE-NEXT: sbfx w13, w12, #8, #1 +; NO_SVE-NEXT: mov v26.b[7], w14 +; NO_SVE-NEXT: mov v0.b[15], w8 +; NO_SVE-NEXT: sbfx w8, w10, #8, #1 +; NO_SVE-NEXT: mov v4.b[8], w11 +; NO_SVE-NEXT: sbfx w11, w9, #9, #1 +; NO_SVE-NEXT: mov v23.b[8], w13 +; NO_SVE-NEXT: sbfx w13, w10, #9, #1 +; NO_SVE-NEXT: mov v26.b[8], w8 +; NO_SVE-NEXT: sbfx w8, w12, #9, #1 +; NO_SVE-NEXT: bsl v0.16b, v24.16b, v27.16b +; NO_SVE-NEXT: mov v4.b[9], w11 +; NO_SVE-NEXT: sbfx w11, w9, #10, #1 +; NO_SVE-NEXT: mov v23.b[9], w8 +; NO_SVE-NEXT: sbfx w8, w12, #10, #1 +; NO_SVE-NEXT: mov v26.b[9], w13 +; NO_SVE-NEXT: sbfx w13, w10, #10, #1 +; NO_SVE-NEXT: ldp q27, q24, [x1] +; NO_SVE-NEXT: mov v4.b[10], w11 +; NO_SVE-NEXT: sbfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v23.b[10], w8 +; NO_SVE-NEXT: sbfx w8, w12, #11, #1 +; NO_SVE-NEXT: mov v26.b[10], w13 +; NO_SVE-NEXT: sbfx w13, w10, #11, #1 +; NO_SVE-NEXT: stp q19, q18, [x0, #64] +; NO_SVE-NEXT: mov v4.b[11], w11 +; NO_SVE-NEXT: sbfx w11, w9, #12, #1 +; NO_SVE-NEXT: mov v23.b[11], w8 +; NO_SVE-NEXT: sbfx w8, w12, #12, #1 +; NO_SVE-NEXT: mov v26.b[11], w13 +; NO_SVE-NEXT: sbfx w13, w10, #12, #1 +; NO_SVE-NEXT: stp q17, q16, [x0, #96] +; NO_SVE-NEXT: mov v4.b[12], w11 +; NO_SVE-NEXT: sbfx w11, w9, #13, #1 +; NO_SVE-NEXT: mov v23.b[12], w8 +; NO_SVE-NEXT: sbfx w8, w12, #13, #1 +; NO_SVE-NEXT: mov v26.b[12], w13 +; NO_SVE-NEXT: sbfx w13, w10, #13, #1 +; NO_SVE-NEXT: stp q7, q6, [x0, #128] +; NO_SVE-NEXT: mov v4.b[13], w11 +; NO_SVE-NEXT: sbfx w11, w9, #14, #1 +; NO_SVE-NEXT: mov v23.b[13], w8 +; NO_SVE-NEXT: sbfx w8, w12, #14, #1 +; NO_SVE-NEXT: mov v26.b[13], w13 +; NO_SVE-NEXT: sbfx w13, w10, #14, #1 +; NO_SVE-NEXT: sbfx w9, w9, #15, #1 +; NO_SVE-NEXT: sbfx w10, w10, #15, #1 +; NO_SVE-NEXT: mov v4.b[14], w11 +; NO_SVE-NEXT: stp q5, q25, [x0, #160] +; NO_SVE-NEXT: mov v23.b[14], w8 +; NO_SVE-NEXT: 
sbfx w8, w12, #15, #1 +; NO_SVE-NEXT: mov v26.b[14], w13 +; NO_SVE-NEXT: stp q3, q2, [x0, #192] +; NO_SVE-NEXT: stp q0, q1, [x0, #224] +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: mov v4.b[15], w9 +; NO_SVE-NEXT: mov v23.b[15], w8 +; NO_SVE-NEXT: mov v26.b[15], w10 +; NO_SVE-NEXT: mov v2.16b, v4.16b +; NO_SVE-NEXT: bsl v2.16b, v21.16b, v22.16b +; NO_SVE-NEXT: bif v0.16b, v24.16b, v23.16b +; NO_SVE-NEXT: bif v1.16b, v27.16b, v26.16b +; NO_SVE-NEXT: stp q2, q20, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -610,8 +1710,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr x8, [x2, #24] ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 ; VBITS_GE_2048-NEXT: ptrue p1.b @@ -1154,6 +2254,13 @@ ; Don't use SVE for 64-bit vectors. define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v4i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v2.4h, v2.4h, #15 +; NO_SVE-NEXT: cmlt v2.4h, v2.4h, #0 +; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v2.4h, v2.4h, #15 @@ -1166,6 +2273,14 @@ ; Don't use SVE for 128-bit vectors. define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: shl v2.8h, v2.8h, #15 +; NO_SVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v2.8h, v2.8b, #0 @@ -1178,6 +2293,54 @@ } define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, <16 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2] +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: and w9, w8, #0x1 +; NO_SVE-NEXT: ubfx w10, w8, #8, #1 +; NO_SVE-NEXT: ubfx w11, w8, #1, #1 +; NO_SVE-NEXT: fmov s0, w9 +; NO_SVE-NEXT: ubfx w9, w8, #9, #1 +; NO_SVE-NEXT: fmov s1, w10 +; NO_SVE-NEXT: ubfx w10, w8, #2, #1 +; NO_SVE-NEXT: ldp q4, q5, [x1] +; NO_SVE-NEXT: mov v0.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w8, #3, #1 +; NO_SVE-NEXT: mov v1.b[1], w9 +; NO_SVE-NEXT: ubfx w9, w8, #10, #1 +; NO_SVE-NEXT: mov v0.b[2], w10 +; NO_SVE-NEXT: ubfx w10, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #11, #1 +; NO_SVE-NEXT: mov v0.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w8, #5, #1 +; NO_SVE-NEXT: mov v1.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w8, #12, #1 +; NO_SVE-NEXT: mov v0.b[4], w10 +; NO_SVE-NEXT: ubfx w10, w8, #6, #1 +; NO_SVE-NEXT: mov v1.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w8, #13, #1 +; NO_SVE-NEXT: mov v0.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w8, #7, #1 +; NO_SVE-NEXT: mov v1.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w8, #14, #1 +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: mov v0.b[6], w10 +; NO_SVE-NEXT: mov v1.b[6], w9 +; NO_SVE-NEXT: mov v0.b[7], w11 +; NO_SVE-NEXT: mov v1.b[7], w8 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: shl v0.8h, v0.8h, #15 +; NO_SVE-NEXT: shl v1.8h, v1.8h, #15 +; NO_SVE-NEXT: cmlt v0.8h, v0.8h, #0 +; 
NO_SVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NO_SVE-NEXT: bsl v0.16b, v2.16b, v4.16b +; NO_SVE-NEXT: bsl v1.16b, v3.16b, v5.16b +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -1186,8 +2349,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldrh w8, [x2] ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p1.h @@ -1246,6 +2409,98 @@ } define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w9, [x2] +; NO_SVE-NEXT: ldrh w8, [x2, #2] +; NO_SVE-NEXT: ldp q4, q5, [x0, #32] +; NO_SVE-NEXT: ubfx w10, w9, #8, #1 +; NO_SVE-NEXT: and w12, w9, #0x1 +; NO_SVE-NEXT: ubfx w11, w8, #8, #1 +; NO_SVE-NEXT: fmov s0, w10 +; NO_SVE-NEXT: and w10, w8, #0x1 +; NO_SVE-NEXT: fmov s1, w12 +; NO_SVE-NEXT: ubfx w12, w9, #9, #1 +; NO_SVE-NEXT: fmov s2, w11 +; NO_SVE-NEXT: ubfx w11, w9, #1, #1 +; NO_SVE-NEXT: fmov s3, w10 +; NO_SVE-NEXT: ubfx w10, w8, #9, #1 +; NO_SVE-NEXT: mov v0.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w8, #1, #1 +; NO_SVE-NEXT: mov v1.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w9, #10, #1 +; NO_SVE-NEXT: mov v2.b[1], w10 +; NO_SVE-NEXT: ubfx w10, w9, #2, #1 +; NO_SVE-NEXT: mov v3.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w8, #10, #1 +; NO_SVE-NEXT: mov v0.b[2], w11 +; NO_SVE-NEXT: ubfx w11, w8, #2, #1 +; NO_SVE-NEXT: mov v1.b[2], w10 +; NO_SVE-NEXT: ubfx w10, w9, #11, #1 +; NO_SVE-NEXT: mov v2.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w9, #3, #1 +; NO_SVE-NEXT: mov v3.b[2], w11 +; NO_SVE-NEXT: ubfx w11, w8, #11, #1 +; NO_SVE-NEXT: mov v0.b[3], w10 +; NO_SVE-NEXT: ubfx w10, w8, #3, #1 +; NO_SVE-NEXT: mov v1.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w9, #12, #1 +; NO_SVE-NEXT: mov v2.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #4, #1 +; NO_SVE-NEXT: mov v3.b[3], w10 +; NO_SVE-NEXT: ubfx w10, w8, #12, #1 +; NO_SVE-NEXT: mov v0.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w11 +; NO_SVE-NEXT: ubfx w11, w9, #13, #1 +; NO_SVE-NEXT: mov v2.b[4], w10 +; NO_SVE-NEXT: ubfx w10, w9, #5, #1 +; NO_SVE-NEXT: mov v3.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w8, #13, #1 +; NO_SVE-NEXT: mov v0.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w8, #5, #1 +; NO_SVE-NEXT: mov v1.b[5], w10 +; NO_SVE-NEXT: ubfx w10, w9, #14, #1 +; NO_SVE-NEXT: mov v2.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w9, #6, #1 +; NO_SVE-NEXT: mov v3.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w8, #14, #1 +; NO_SVE-NEXT: mov v0.b[6], w10 +; NO_SVE-NEXT: ubfx w10, w8, #6, #1 +; NO_SVE-NEXT: mov v1.b[6], w12 +; NO_SVE-NEXT: lsr w12, w9, #15 +; NO_SVE-NEXT: mov v2.b[6], w11 +; NO_SVE-NEXT: ubfx w9, w9, #7, #1 +; NO_SVE-NEXT: mov v3.b[6], w10 +; NO_SVE-NEXT: lsr w10, w8, #15 +; NO_SVE-NEXT: ubfx w8, w8, #7, #1 +; NO_SVE-NEXT: ldp q6, q7, [x1, #32] +; NO_SVE-NEXT: mov v2.b[7], w10 +; NO_SVE-NEXT: mov v3.b[7], w8 +; NO_SVE-NEXT: mov v0.b[7], w12 +; NO_SVE-NEXT: mov v1.b[7], w9 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: ushll v3.8h, v3.8b, #0 +; NO_SVE-NEXT: shl v2.8h, v2.8h, #15 +; NO_SVE-NEXT: shl v3.8h, v3.8h, #15 +; NO_SVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NO_SVE-NEXT: cmlt v3.8h, v3.8h, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: bsl v2.16b, v5.16b, v7.16b +; NO_SVE-NEXT: ldp q5, q7, [x1] 
+; NO_SVE-NEXT: bsl v3.16b, v4.16b, v6.16b +; NO_SVE-NEXT: shl v1.8h, v1.8h, #15 +; NO_SVE-NEXT: shl v0.8h, v0.8h, #15 +; NO_SVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NO_SVE-NEXT: cmlt v0.8h, v0.8h, #0 +; NO_SVE-NEXT: stp q3, q2, [x0, #32] +; NO_SVE-NEXT: ldp q4, q6, [x0] +; NO_SVE-NEXT: bsl v1.16b, v4.16b, v5.16b +; NO_SVE-NEXT: bsl v0.16b, v6.16b, v7.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -1254,8 +2509,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldr w8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ptrue p1.h @@ -1346,6 +2601,187 @@ } define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, <64 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v64i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w10, [x2] +; NO_SVE-NEXT: ldrh w9, [x2, #2] +; NO_SVE-NEXT: ldrh w8, [x2, #4] +; NO_SVE-NEXT: and w12, w10, #0x1 +; NO_SVE-NEXT: ubfx w14, w10, #8, #1 +; NO_SVE-NEXT: and w11, w9, #0x1 +; NO_SVE-NEXT: ubfx w13, w9, #8, #1 +; NO_SVE-NEXT: ldp q19, q18, [x1, #64] +; NO_SVE-NEXT: fmov s0, w12 +; NO_SVE-NEXT: and w12, w8, #0x1 +; NO_SVE-NEXT: fmov s2, w11 +; NO_SVE-NEXT: ubfx w11, w8, #8, #1 +; NO_SVE-NEXT: fmov s3, w13 +; NO_SVE-NEXT: ubfx w13, w10, #1, #1 +; NO_SVE-NEXT: fmov s1, w14 +; NO_SVE-NEXT: fmov s4, w12 +; NO_SVE-NEXT: ubfx w12, w10, #9, #1 +; NO_SVE-NEXT: fmov s5, w11 +; NO_SVE-NEXT: ubfx w11, w9, #1, #1 +; NO_SVE-NEXT: ldr q24, [x0, #64] +; NO_SVE-NEXT: mov v0.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w9, #9, #1 +; NO_SVE-NEXT: mov v1.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w10, #2, #1 +; NO_SVE-NEXT: mov v2.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w10, #10, #1 +; NO_SVE-NEXT: mov v3.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w9, #2, #1 +; NO_SVE-NEXT: mov v0.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w9, #10, #1 +; NO_SVE-NEXT: mov v1.b[2], w11 +; NO_SVE-NEXT: ubfx w11, w10, #3, #1 +; NO_SVE-NEXT: mov v2.b[2], w13 +; NO_SVE-NEXT: ubfx w13, w10, #11, #1 +; NO_SVE-NEXT: mov v3.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w9, #3, #1 +; NO_SVE-NEXT: mov v0.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v1.b[3], w13 +; NO_SVE-NEXT: ubfx w13, w10, #4, #1 +; NO_SVE-NEXT: mov v2.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w10, #12, #1 +; NO_SVE-NEXT: mov v3.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #4, #1 +; NO_SVE-NEXT: mov v0.b[4], w13 +; NO_SVE-NEXT: ubfx w13, w9, #12, #1 +; NO_SVE-NEXT: mov v1.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w10, #5, #1 +; NO_SVE-NEXT: mov v2.b[4], w11 +; NO_SVE-NEXT: ubfx w11, w10, #13, #1 +; NO_SVE-NEXT: mov v3.b[4], w13 +; NO_SVE-NEXT: ubfx w13, w9, #5, #1 +; NO_SVE-NEXT: mov v0.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w9, #13, #1 +; NO_SVE-NEXT: mov v1.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w10, #6, #1 +; NO_SVE-NEXT: mov v2.b[5], w13 +; NO_SVE-NEXT: ubfx w13, w10, #14, #1 +; NO_SVE-NEXT: mov v3.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w9, #6, #1 +; NO_SVE-NEXT: mov v0.b[6], w11 +; NO_SVE-NEXT: ldrh w11, [x2, #6] +; NO_SVE-NEXT: mov v1.b[6], w13 +; NO_SVE-NEXT: ubfx w13, w9, #14, #1 +; NO_SVE-NEXT: mov v2.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w10, #7, #1 +; NO_SVE-NEXT: lsr w10, w10, #15 +; NO_SVE-NEXT: mov v3.b[6], w13 +; NO_SVE-NEXT: ubfx w13, w8, #1, #1 +; 
NO_SVE-NEXT: mov v0.b[7], w12 +; NO_SVE-NEXT: ubfx w12, w9, #7, #1 +; NO_SVE-NEXT: mov v1.b[7], w10 +; NO_SVE-NEXT: ubfx w10, w8, #9, #1 +; NO_SVE-NEXT: mov v4.b[1], w13 +; NO_SVE-NEXT: lsr w9, w9, #15 +; NO_SVE-NEXT: mov v2.b[7], w12 +; NO_SVE-NEXT: ubfx w12, w8, #2, #1 +; NO_SVE-NEXT: mov v5.b[1], w10 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: mov v3.b[7], w9 +; NO_SVE-NEXT: ubfx w9, w8, #10, #1 +; NO_SVE-NEXT: mov v4.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w11, #8, #1 +; NO_SVE-NEXT: fmov s6, w10 +; NO_SVE-NEXT: ubfx w10, w11, #1, #1 +; NO_SVE-NEXT: mov v5.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #3, #1 +; NO_SVE-NEXT: fmov s7, w12 +; NO_SVE-NEXT: ubfx w12, w11, #9, #1 +; NO_SVE-NEXT: mov v6.b[1], w10 +; NO_SVE-NEXT: ubfx w10, w8, #11, #1 +; NO_SVE-NEXT: mov v4.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w11, #2, #1 +; NO_SVE-NEXT: mov v7.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w8, #4, #1 +; NO_SVE-NEXT: mov v5.b[3], w10 +; NO_SVE-NEXT: ubfx w10, w11, #10, #1 +; NO_SVE-NEXT: mov v6.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #12, #1 +; NO_SVE-NEXT: mov v4.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w11, #3, #1 +; NO_SVE-NEXT: mov v7.b[2], w10 +; NO_SVE-NEXT: ubfx w10, w8, #5, #1 +; NO_SVE-NEXT: mov v5.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w11, #11, #1 +; NO_SVE-NEXT: mov v6.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w8, #13, #1 +; NO_SVE-NEXT: mov v4.b[5], w10 +; NO_SVE-NEXT: ubfx w10, w11, #4, #1 +; NO_SVE-NEXT: mov v7.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w8, #6, #1 +; NO_SVE-NEXT: mov v5.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w11, #12, #1 +; NO_SVE-NEXT: mov v6.b[4], w10 +; NO_SVE-NEXT: ubfx w10, w8, #14, #1 +; NO_SVE-NEXT: mov v4.b[6], w9 +; NO_SVE-NEXT: ubfx w9, w11, #5, #1 +; NO_SVE-NEXT: mov v7.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w8, #7, #1 +; NO_SVE-NEXT: mov v5.b[6], w10 +; NO_SVE-NEXT: ubfx w10, w11, #13, #1 +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: mov v4.b[7], w12 +; NO_SVE-NEXT: mov v6.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w11, #6, #1 +; NO_SVE-NEXT: mov v7.b[5], w10 +; NO_SVE-NEXT: mov v5.b[7], w8 +; NO_SVE-NEXT: ubfx w8, w11, #14, #1 +; NO_SVE-NEXT: ushll v4.8h, v4.8b, #0 +; NO_SVE-NEXT: mov v6.b[6], w9 +; NO_SVE-NEXT: ubfx w9, w11, #7, #1 +; NO_SVE-NEXT: mov v7.b[6], w8 +; NO_SVE-NEXT: lsr w8, w11, #15 +; NO_SVE-NEXT: shl v4.8h, v4.8h, #15 +; NO_SVE-NEXT: ushll v5.8h, v5.8b, #0 +; NO_SVE-NEXT: cmlt v4.8h, v4.8h, #0 +; NO_SVE-NEXT: mov v6.b[7], w9 +; NO_SVE-NEXT: mov v7.b[7], w8 +; NO_SVE-NEXT: shl v5.8h, v5.8h, #15 +; NO_SVE-NEXT: bsl v4.16b, v24.16b, v19.16b +; NO_SVE-NEXT: ldr q19, [x0, #80] +; NO_SVE-NEXT: cmlt v5.8h, v5.8h, #0 +; NO_SVE-NEXT: ldp q17, q16, [x1, #96] +; NO_SVE-NEXT: ushll v6.8h, v6.8b, #0 +; NO_SVE-NEXT: bsl v5.16b, v19.16b, v18.16b +; NO_SVE-NEXT: ushll v7.8h, v7.8b, #0 +; NO_SVE-NEXT: shl v6.8h, v6.8h, #15 +; NO_SVE-NEXT: shl v7.8h, v7.8h, #15 +; NO_SVE-NEXT: ldp q18, q19, [x0, #96] +; NO_SVE-NEXT: cmlt v6.8h, v6.8h, #0 +; NO_SVE-NEXT: cmlt v7.8h, v7.8h, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: ldp q21, q20, [x1, #32] +; NO_SVE-NEXT: ushll v3.8h, v3.8b, #0 +; NO_SVE-NEXT: shl v0.8h, v0.8h, #15 +; NO_SVE-NEXT: bsl v6.16b, v18.16b, v17.16b +; NO_SVE-NEXT: bsl v7.16b, v19.16b, v16.16b +; NO_SVE-NEXT: shl v1.8h, v1.8h, #15 +; NO_SVE-NEXT: shl v3.8h, v3.8h, #15 +; NO_SVE-NEXT: shl v2.8h, v2.8h, #15 +; NO_SVE-NEXT: ldp q23, q22, [x1] +; NO_SVE-NEXT: cmlt v0.8h, v0.8h, #0 +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: cmlt 
v3.8h, v3.8h, #0 +; NO_SVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NO_SVE-NEXT: ldp q18, q17, [x0, #32] +; NO_SVE-NEXT: bsl v2.16b, v18.16b, v21.16b +; NO_SVE-NEXT: ldp q19, q16, [x0] +; NO_SVE-NEXT: bsl v3.16b, v17.16b, v20.16b +; NO_SVE-NEXT: bsl v0.16b, v19.16b, v23.16b +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: bsl v1.16b, v16.16b, v22.16b +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -1354,8 +2790,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldr x8, [x2] ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 ; VBITS_GE_1024-NEXT: ptrue p1.h @@ -1510,6 +2946,365 @@ } define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, <128 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v128i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w10, [x2, #2] +; NO_SVE-NEXT: ldrh w9, [x2, #4] +; NO_SVE-NEXT: ldrh w8, [x2, #8] +; NO_SVE-NEXT: and w13, w10, #0x1 +; NO_SVE-NEXT: ubfx w12, w10, #8, #1 +; NO_SVE-NEXT: and w11, w9, #0x1 +; NO_SVE-NEXT: ubfx w14, w10, #7, #1 +; NO_SVE-NEXT: ldp q23, q22, [x0, #32] +; NO_SVE-NEXT: fmov s0, w13 +; NO_SVE-NEXT: ubfx w13, w9, #8, #1 +; NO_SVE-NEXT: fmov s1, w12 +; NO_SVE-NEXT: and w12, w8, #0x1 +; NO_SVE-NEXT: fmov s2, w11 +; NO_SVE-NEXT: ubfx w11, w10, #1, #1 +; NO_SVE-NEXT: fmov s3, w13 +; NO_SVE-NEXT: ubfx w13, w10, #9, #1 +; NO_SVE-NEXT: fmov s4, w12 +; NO_SVE-NEXT: ubfx w12, w9, #1, #1 +; NO_SVE-NEXT: mov v0.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w9, #9, #1 +; NO_SVE-NEXT: mov v1.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w10, #2, #1 +; NO_SVE-NEXT: mov v2.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w10, #10, #1 +; NO_SVE-NEXT: mov v3.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w9, #2, #1 +; NO_SVE-NEXT: mov v0.b[2], w13 +; NO_SVE-NEXT: ubfx w13, w8, #1, #1 +; NO_SVE-NEXT: mov v1.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w10, #3, #1 +; NO_SVE-NEXT: mov v2.b[2], w11 +; NO_SVE-NEXT: ubfx w11, w10, #11, #1 +; NO_SVE-NEXT: mov v4.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w9, #3, #1 +; NO_SVE-NEXT: mov v0.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w9, #10, #1 +; NO_SVE-NEXT: mov v1.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w10, #4, #1 +; NO_SVE-NEXT: mov v2.b[3], w13 +; NO_SVE-NEXT: ubfx w13, w10, #12, #1 +; NO_SVE-NEXT: mov v3.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w9, #4, #1 +; NO_SVE-NEXT: mov v0.b[4], w11 +; NO_SVE-NEXT: ubfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v1.b[4], w13 +; NO_SVE-NEXT: ubfx w13, w10, #5, #1 +; NO_SVE-NEXT: mov v2.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w10, #13, #1 +; NO_SVE-NEXT: mov v3.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #5, #1 +; NO_SVE-NEXT: mov v0.b[5], w13 +; NO_SVE-NEXT: ubfx w13, w9, #12, #1 +; NO_SVE-NEXT: mov v1.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w10, #6, #1 +; NO_SVE-NEXT: mov v2.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w10, #14, #1 +; NO_SVE-NEXT: mov v3.b[4], w13 +; NO_SVE-NEXT: ubfx w13, w9, #6, #1 +; NO_SVE-NEXT: mov v0.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w9, #13, #1 +; NO_SVE-NEXT: mov v1.b[6], w11 +; NO_SVE-NEXT: lsr w10, w10, #15 +; NO_SVE-NEXT: mov v2.b[6], w13 +; NO_SVE-NEXT: ldrh w11, [x2, #10] +; NO_SVE-NEXT: mov v3.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w9, #7, #1 +; NO_SVE-NEXT: ubfx w13, w8, #8, #1 +; NO_SVE-NEXT: mov v1.b[7], w10 +; NO_SVE-NEXT: ubfx 
w10, w9, #14, #1 +; NO_SVE-NEXT: mov v2.b[7], w12 +; NO_SVE-NEXT: ubfx w12, w8, #2, #1 +; NO_SVE-NEXT: fmov s5, w13 +; NO_SVE-NEXT: ubfx w13, w8, #9, #1 +; NO_SVE-NEXT: mov v3.b[6], w10 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: mov v4.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w8, #3, #1 +; NO_SVE-NEXT: mov v5.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w11, #1, #1 +; NO_SVE-NEXT: fmov s6, w10 +; NO_SVE-NEXT: lsr w9, w9, #15 +; NO_SVE-NEXT: mov v0.b[7], w14 +; NO_SVE-NEXT: ubfx w14, w11, #12, #1 +; NO_SVE-NEXT: mov v4.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w8, #10, #1 +; NO_SVE-NEXT: mov v6.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w8, #4, #1 +; NO_SVE-NEXT: mov v3.b[7], w9 +; NO_SVE-NEXT: ubfx w9, w11, #2, #1 +; NO_SVE-NEXT: mov v5.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w11, #8, #1 +; NO_SVE-NEXT: mov v4.b[4], w13 +; NO_SVE-NEXT: ubfx w13, w8, #11, #1 +; NO_SVE-NEXT: mov v6.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #5, #1 +; NO_SVE-NEXT: fmov s7, w12 +; NO_SVE-NEXT: ubfx w12, w11, #9, #1 +; NO_SVE-NEXT: mov v5.b[3], w13 +; NO_SVE-NEXT: ubfx w13, w11, #3, #1 +; NO_SVE-NEXT: mov v4.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w8, #12, #1 +; NO_SVE-NEXT: mov v7.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w8, #6, #1 +; NO_SVE-NEXT: mov v6.b[3], w13 +; NO_SVE-NEXT: ubfx w13, w11, #10, #1 +; NO_SVE-NEXT: mov v5.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w11, #4, #1 +; NO_SVE-NEXT: mov v4.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w8, #13, #1 +; NO_SVE-NEXT: mov v7.b[2], w13 +; NO_SVE-NEXT: ubfx w13, w8, #7, #1 +; NO_SVE-NEXT: mov v6.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w11, #11, #1 +; NO_SVE-NEXT: mov v5.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w11, #5, #1 +; NO_SVE-NEXT: mov v4.b[7], w13 +; NO_SVE-NEXT: ubfx w13, w8, #14, #1 +; NO_SVE-NEXT: mov v7.b[3], w9 +; NO_SVE-NEXT: ldrh w10, [x2, #12] +; NO_SVE-NEXT: mov v6.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w11, #6, #1 +; NO_SVE-NEXT: mov v5.b[6], w13 +; NO_SVE-NEXT: ldrh w9, [x2, #14] +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: and w13, w10, #0x1 +; NO_SVE-NEXT: mov v7.b[4], w14 +; NO_SVE-NEXT: ubfx w14, w11, #7, #1 +; NO_SVE-NEXT: mov v6.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w11, #13, #1 +; NO_SVE-NEXT: mov v5.b[7], w8 +; NO_SVE-NEXT: and w8, w9, #0x1 +; NO_SVE-NEXT: fmov s16, w13 +; NO_SVE-NEXT: ubfx w13, w10, #1, #1 +; NO_SVE-NEXT: mov v7.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w11, #14, #1 +; NO_SVE-NEXT: fmov s18, w8 +; NO_SVE-NEXT: ubfx w8, w9, #1, #1 +; NO_SVE-NEXT: mov v16.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w10, #8, #1 +; NO_SVE-NEXT: lsr w11, w11, #15 +; NO_SVE-NEXT: mov v7.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w10, #2, #1 +; NO_SVE-NEXT: fmov s17, w13 +; NO_SVE-NEXT: mov v18.b[1], w8 +; NO_SVE-NEXT: ubfx w8, w10, #9, #1 +; NO_SVE-NEXT: mov v16.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w9, #2, #1 +; NO_SVE-NEXT: mov v7.b[7], w11 +; NO_SVE-NEXT: ubfx w11, w10, #3, #1 +; NO_SVE-NEXT: mov v17.b[1], w8 +; NO_SVE-NEXT: ubfx w8, w9, #8, #1 +; NO_SVE-NEXT: mov v18.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w10, #10, #1 +; NO_SVE-NEXT: mov v16.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #3, #1 +; NO_SVE-NEXT: fmov s19, w8 +; NO_SVE-NEXT: ubfx w8, w9, #9, #1 +; NO_SVE-NEXT: mov v17.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w10, #4, #1 +; NO_SVE-NEXT: mov v18.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w10, #11, #1 +; NO_SVE-NEXT: mov v19.b[1], w8 +; NO_SVE-NEXT: ubfx w8, w9, #4, #1 +; NO_SVE-NEXT: mov v16.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w9, #10, #1 +; NO_SVE-NEXT: mov v17.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w10, #5, #1 +; NO_SVE-NEXT: mov v18.b[4], w8 +; NO_SVE-NEXT: ubfx w8, w10, #12, #1 +; NO_SVE-NEXT: mov v19.b[2], w12 +; NO_SVE-NEXT: ubfx 
w12, w9, #5, #1 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ldp q25, q24, [x1, #32] +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: shl v0.8h, v0.8h, #15 +; NO_SVE-NEXT: mov v16.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v17.b[4], w8 +; NO_SVE-NEXT: ubfx w8, w10, #6, #1 +; NO_SVE-NEXT: shl v1.8h, v1.8h, #15 +; NO_SVE-NEXT: mov v18.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w10, #13, #1 +; NO_SVE-NEXT: mov v19.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #6, #1 +; NO_SVE-NEXT: cmlt v0.8h, v0.8h, #0 +; NO_SVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NO_SVE-NEXT: mov v16.b[6], w8 +; NO_SVE-NEXT: ubfx w8, w9, #12, #1 +; NO_SVE-NEXT: mov v17.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w10, #7, #1 +; NO_SVE-NEXT: bsl v0.16b, v23.16b, v25.16b +; NO_SVE-NEXT: ldp q25, q23, [x0, #64] +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: bsl v1.16b, v22.16b, v24.16b +; NO_SVE-NEXT: ushll v3.8h, v3.8b, #0 +; NO_SVE-NEXT: mov v19.b[4], w8 +; NO_SVE-NEXT: ubfx w8, w9, #7, #1 +; NO_SVE-NEXT: shl v2.8h, v2.8h, #15 +; NO_SVE-NEXT: ldp q22, q26, [x1, #64] +; NO_SVE-NEXT: shl v3.8h, v3.8h, #15 +; NO_SVE-NEXT: mov v18.b[6], w11 +; NO_SVE-NEXT: ubfx w11, w10, #14, #1 +; NO_SVE-NEXT: mov v16.b[7], w12 +; NO_SVE-NEXT: ubfx w12, w9, #13, #1 +; NO_SVE-NEXT: cmlt v2.8h, v2.8h, #0 +; NO_SVE-NEXT: lsr w10, w10, #15 +; NO_SVE-NEXT: cmlt v3.8h, v3.8h, #0 +; NO_SVE-NEXT: mov v19.b[5], w12 +; NO_SVE-NEXT: bsl v2.16b, v25.16b, v22.16b +; NO_SVE-NEXT: ldp q24, q22, [x0, #128] +; NO_SVE-NEXT: ushll v4.8h, v4.8b, #0 +; NO_SVE-NEXT: bsl v3.16b, v23.16b, v26.16b +; NO_SVE-NEXT: ushll v5.8h, v5.8b, #0 +; NO_SVE-NEXT: mov v18.b[7], w8 +; NO_SVE-NEXT: lsr w8, w9, #15 +; NO_SVE-NEXT: ubfx w9, w9, #14, #1 +; NO_SVE-NEXT: shl v4.8h, v4.8h, #15 +; NO_SVE-NEXT: ldp q23, q25, [x1, #128] +; NO_SVE-NEXT: mov v6.b[7], w14 +; NO_SVE-NEXT: shl v5.8h, v5.8h, #15 +; NO_SVE-NEXT: cmlt v4.8h, v4.8h, #0 +; NO_SVE-NEXT: cmlt v5.8h, v5.8h, #0 +; NO_SVE-NEXT: mov v19.b[6], w9 +; NO_SVE-NEXT: mov v17.b[6], w11 +; NO_SVE-NEXT: bsl v4.16b, v24.16b, v23.16b +; NO_SVE-NEXT: ldp q24, q23, [x0, #160] +; NO_SVE-NEXT: ushll v6.8h, v6.8b, #0 +; NO_SVE-NEXT: bsl v5.16b, v22.16b, v25.16b +; NO_SVE-NEXT: ushll v7.8h, v7.8b, #0 +; NO_SVE-NEXT: shl v6.8h, v6.8h, #15 +; NO_SVE-NEXT: shl v7.8h, v7.8h, #15 +; NO_SVE-NEXT: ldp q22, q26, [x1, #160] +; NO_SVE-NEXT: mov v19.b[7], w8 +; NO_SVE-NEXT: mov v17.b[7], w10 +; NO_SVE-NEXT: cmlt v6.8h, v6.8h, #0 +; NO_SVE-NEXT: ushll v16.8h, v16.8b, #0 +; NO_SVE-NEXT: cmlt v7.8h, v7.8h, #0 +; NO_SVE-NEXT: ldp q21, q20, [x0, #224] +; NO_SVE-NEXT: bsl v6.16b, v24.16b, v22.16b +; NO_SVE-NEXT: shl v16.8h, v16.8h, #15 +; NO_SVE-NEXT: bsl v7.16b, v23.16b, v26.16b +; NO_SVE-NEXT: ushll v18.8h, v18.8b, #0 +; NO_SVE-NEXT: ushll v19.8h, v19.8b, #0 +; NO_SVE-NEXT: ushll v17.8h, v17.8b, #0 +; NO_SVE-NEXT: ldp q24, q22, [x0, #192] +; NO_SVE-NEXT: cmlt v16.8h, v16.8h, #0 +; NO_SVE-NEXT: shl v18.8h, v18.8h, #15 +; NO_SVE-NEXT: shl v19.8h, v19.8h, #15 +; NO_SVE-NEXT: shl v17.8h, v17.8h, #15 +; NO_SVE-NEXT: cmlt v18.8h, v18.8h, #0 +; NO_SVE-NEXT: cmlt v19.8h, v19.8h, #0 +; NO_SVE-NEXT: ldp q23, q25, [x1, #224] +; NO_SVE-NEXT: cmlt v17.8h, v17.8h, #0 +; NO_SVE-NEXT: bsl v18.16b, v21.16b, v23.16b +; NO_SVE-NEXT: ldr q26, [x1, #192] +; NO_SVE-NEXT: ldrh w8, [x2, #6] +; NO_SVE-NEXT: ldrh w10, [x2] +; NO_SVE-NEXT: bsl v16.16b, v24.16b, v26.16b +; NO_SVE-NEXT: ldr q24, [x1, #208] +; NO_SVE-NEXT: bsl v19.16b, v20.16b, v25.16b +; NO_SVE-NEXT: and w12, w8, #0x1 +; NO_SVE-NEXT: ubfx w9, w8, #8, #1 +; NO_SVE-NEXT: and w11, w10, #0x1 +; 
NO_SVE-NEXT: bsl v17.16b, v22.16b, v24.16b +; NO_SVE-NEXT: ldp q21, q20, [x1, #96] +; NO_SVE-NEXT: ldp q23, q22, [x1] +; NO_SVE-NEXT: stp q0, q1, [x0, #32] +; NO_SVE-NEXT: stp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q4, q5, [x0, #128] +; NO_SVE-NEXT: stp q6, q7, [x0, #160] +; NO_SVE-NEXT: stp q16, q17, [x0, #192] +; NO_SVE-NEXT: fmov s17, w9 +; NO_SVE-NEXT: stp q18, q19, [x0, #224] +; NO_SVE-NEXT: fmov s18, w12 +; NO_SVE-NEXT: ubfx w12, w10, #8, #1 +; NO_SVE-NEXT: fmov s19, w11 +; NO_SVE-NEXT: ubfx w11, w8, #9, #1 +; NO_SVE-NEXT: ubfx w9, w8, #1, #1 +; NO_SVE-NEXT: ldp q7, q6, [x0, #96] +; NO_SVE-NEXT: fmov s16, w12 +; NO_SVE-NEXT: ubfx w12, w10, #1, #1 +; NO_SVE-NEXT: mov v17.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w10, #9, #1 +; NO_SVE-NEXT: mov v18.b[1], w9 +; NO_SVE-NEXT: ubfx w9, w8, #10, #1 +; NO_SVE-NEXT: mov v19.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w8, #2, #1 +; NO_SVE-NEXT: mov v16.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w10, #2, #1 +; NO_SVE-NEXT: mov v17.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w10, #10, #1 +; NO_SVE-NEXT: mov v18.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w8, #11, #1 +; NO_SVE-NEXT: mov v19.b[2], w11 +; NO_SVE-NEXT: ubfx w11, w8, #3, #1 +; NO_SVE-NEXT: mov v16.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w10, #3, #1 +; NO_SVE-NEXT: mov v17.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w10, #11, #1 +; NO_SVE-NEXT: mov v18.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w8, #12, #1 +; NO_SVE-NEXT: mov v19.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w8, #4, #1 +; NO_SVE-NEXT: mov v16.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w10, #4, #1 +; NO_SVE-NEXT: mov v17.b[4], w11 +; NO_SVE-NEXT: ubfx w11, w10, #12, #1 +; NO_SVE-NEXT: mov v18.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w8, #13, #1 +; NO_SVE-NEXT: mov v19.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w8, #5, #1 +; NO_SVE-NEXT: mov v16.b[4], w11 +; NO_SVE-NEXT: ubfx w11, w10, #5, #1 +; NO_SVE-NEXT: mov v17.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w10, #13, #1 +; NO_SVE-NEXT: mov v18.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w8, #14, #1 +; NO_SVE-NEXT: mov v19.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w8, #6, #1 +; NO_SVE-NEXT: mov v16.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w10, #6, #1 +; NO_SVE-NEXT: mov v17.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w10, #14, #1 +; NO_SVE-NEXT: mov v18.b[6], w11 +; NO_SVE-NEXT: lsr w11, w8, #15 +; NO_SVE-NEXT: ubfx w8, w8, #7, #1 +; NO_SVE-NEXT: mov v19.b[6], w9 +; NO_SVE-NEXT: ubfx w9, w10, #7, #1 +; NO_SVE-NEXT: mov v17.b[7], w11 +; NO_SVE-NEXT: lsr w10, w10, #15 +; NO_SVE-NEXT: mov v18.b[7], w8 +; NO_SVE-NEXT: mov v16.b[6], w12 +; NO_SVE-NEXT: mov v19.b[7], w9 +; NO_SVE-NEXT: ushll v17.8h, v17.8b, #0 +; NO_SVE-NEXT: ushll v18.8h, v18.8b, #0 +; NO_SVE-NEXT: shl v17.8h, v17.8h, #15 +; NO_SVE-NEXT: shl v18.8h, v18.8h, #15 +; NO_SVE-NEXT: cmlt v17.8h, v17.8h, #0 +; NO_SVE-NEXT: cmlt v18.8h, v18.8h, #0 +; NO_SVE-NEXT: mov v16.b[7], w10 +; NO_SVE-NEXT: mov v5.16b, v18.16b +; NO_SVE-NEXT: bif v6.16b, v20.16b, v17.16b +; NO_SVE-NEXT: bsl v5.16b, v7.16b, v21.16b +; NO_SVE-NEXT: ushll v7.8h, v19.8b, #0 +; NO_SVE-NEXT: ushll v4.8h, v16.8b, #0 +; NO_SVE-NEXT: shl v7.8h, v7.8h, #15 +; NO_SVE-NEXT: stp q5, q6, [x0, #96] +; NO_SVE-NEXT: shl v4.8h, v4.8h, #15 +; NO_SVE-NEXT: ldp q6, q5, [x0] +; NO_SVE-NEXT: cmlt v7.8h, v7.8h, #0 +; NO_SVE-NEXT: cmlt v4.8h, v4.8h, #0 +; NO_SVE-NEXT: mov v3.16b, v7.16b +; NO_SVE-NEXT: bsl v3.16b, v6.16b, v23.16b +; NO_SVE-NEXT: bsl v4.16b, v5.16b, v22.16b +; NO_SVE-NEXT: stp q3, q4, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -1518,8 +3313,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr x8, [x2, #8] ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 ; VBITS_GE_2048-NEXT: ptrue p1.h @@ -1804,6 +3599,13 @@ ; Don't use SVE for 64-bit vectors. define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v2i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: shl v2.2s, v2.2s, #31 +; NO_SVE-NEXT: cmlt v2.2s, v2.2s, #0 +; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v2.2s, v2.2s, #31 @@ -1816,6 +3618,14 @@ ; Don't use SVE for 128-bit vectors. define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v4i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: shl v2.4s, v2.4s, #31 +; NO_SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 @@ -1828,6 +3638,40 @@ } define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, <8 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrb w8, [x2] +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: and w9, w8, #0x1 +; NO_SVE-NEXT: fmov s0, w9 +; NO_SVE-NEXT: ubfx w9, w8, #1, #1 +; NO_SVE-NEXT: ldp q4, q5, [x1] +; NO_SVE-NEXT: mov v0.b[1], w9 +; NO_SVE-NEXT: ubfx w9, w8, #2, #1 +; NO_SVE-NEXT: mov v0.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #3, #1 +; NO_SVE-NEXT: mov v0.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w8, #4, #1 +; NO_SVE-NEXT: mov v0.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w8, #5, #1 +; NO_SVE-NEXT: mov v0.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w8, #6, #1 +; NO_SVE-NEXT: lsr w8, w8, #7 +; NO_SVE-NEXT: mov v0.b[6], w9 +; NO_SVE-NEXT: mov v0.b[7], w8 +; NO_SVE-NEXT: zip1 v1.8b, v0.8b, v0.8b +; NO_SVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: shl v1.4s, v1.4s, #31 +; NO_SVE-NEXT: shl v0.4s, v0.4s, #31 +; NO_SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmlt v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bsl v1.16b, v2.16b, v4.16b +; NO_SVE-NEXT: bsl v0.16b, v3.16b, v5.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -1836,8 +3680,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldrb w8, [x2] ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p1.s @@ -1876,6 +3720,71 @@ } define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, <16 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2] +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldp q16, q17, [x1] +; NO_SVE-NEXT: and w9, w8, #0x1 +; NO_SVE-NEXT: ubfx w10, w8, #1, #1 +; NO_SVE-NEXT: fmov s1, w9 +; NO_SVE-NEXT: ubfx w9, w8, #2, #1 +; NO_SVE-NEXT: ldp q3, q5, [x0, #16] +; NO_SVE-NEXT: mov v1.b[1], w10 +; NO_SVE-NEXT: mov v1.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #3, #1 +; NO_SVE-NEXT: ldr q6, [x0, #48] +; NO_SVE-NEXT: mov v1.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w8, #5, #1 +; NO_SVE-NEXT: mov v1.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w8, #6, #1 +; NO_SVE-NEXT: mov v1.b[6], w9 +; NO_SVE-NEXT: ubfx w9, w8, #7, #1 +; NO_SVE-NEXT: mov v1.b[7], w9 +; NO_SVE-NEXT: ubfx w9, w8, #8, #1 +; NO_SVE-NEXT: mov v1.b[8], w9 +; NO_SVE-NEXT: ubfx w9, w8, #9, #1 +; NO_SVE-NEXT: mov v1.b[9], w9 +; NO_SVE-NEXT: ubfx w9, w8, #10, #1 +; NO_SVE-NEXT: mov v1.b[10], w9 +; NO_SVE-NEXT: ubfx w9, w8, #11, #1 +; NO_SVE-NEXT: mov v1.b[11], w9 +; NO_SVE-NEXT: ubfx w9, w8, #12, #1 +; NO_SVE-NEXT: mov v1.b[12], w9 +; NO_SVE-NEXT: ubfx w9, w8, #13, #1 +; NO_SVE-NEXT: mov v1.b[13], w9 +; NO_SVE-NEXT: ubfx w9, w8, #14, #1 +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: mov v1.b[14], w9 +; NO_SVE-NEXT: mov v1.b[15], w8 +; NO_SVE-NEXT: zip2 v4.8b, v1.8b, v0.8b +; NO_SVE-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: zip1 v1.8b, v1.8b, v0.8b +; NO_SVE-NEXT: ushll v4.4s, v4.4h, #0 +; NO_SVE-NEXT: zip2 v7.8b, v2.8b, v0.8b +; NO_SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b +; NO_SVE-NEXT: shl v4.4s, v4.4s, #31 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: cmlt v4.4s, v4.4s, #0 +; NO_SVE-NEXT: shl v1.4s, v1.4s, #31 +; NO_SVE-NEXT: ushll v7.4s, v7.4h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: bif v3.16b, v17.16b, v4.16b +; NO_SVE-NEXT: ldp q4, q17, [x1, #32] +; NO_SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NO_SVE-NEXT: shl v7.4s, v7.4s, #31 +; NO_SVE-NEXT: shl v2.4s, v2.4s, #31 +; NO_SVE-NEXT: bif v0.16b, v16.16b, v1.16b +; NO_SVE-NEXT: cmlt v1.4s, v2.4s, #0 +; NO_SVE-NEXT: cmlt v2.4s, v7.4s, #0 +; NO_SVE-NEXT: bsl v1.16b, v5.16b, v4.16b +; NO_SVE-NEXT: bsl v2.16b, v6.16b, v17.16b +; NO_SVE-NEXT: stp q0, q3, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #32] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -1884,8 +3793,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldrh w8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ptrue p1.s @@ -1936,6 +3845,133 @@ } define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, <32 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2] +; NO_SVE-NEXT: ldrh w9, [x2, #2] +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: and w10, w8, #0x1 +; NO_SVE-NEXT: and w11, w9, #0x1 +; NO_SVE-NEXT: fmov s1, w10 +; NO_SVE-NEXT: ubfx w10, w8, #1, #1 +; NO_SVE-NEXT: fmov s0, w11 +; NO_SVE-NEXT: ubfx w11, w9, #1, #1 +; NO_SVE-NEXT: ldp q25, q24, [x1] +; NO_SVE-NEXT: mov v1.b[1], w10 +; NO_SVE-NEXT: ubfx w10, w8, #2, #1 +; NO_SVE-NEXT: mov v0.b[1], w11 +; NO_SVE-NEXT: ubfx w11, w9, #2, #1 +; NO_SVE-NEXT: mov v1.b[2], w10 +; NO_SVE-NEXT: ubfx w10, w8, #3, #1 +; NO_SVE-NEXT: mov v0.b[2], w11 +; NO_SVE-NEXT: ubfx w11, w9, #3, #1 +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: mov v1.b[3], w10 +; NO_SVE-NEXT: ubfx w10, w8, #4, #1 +; NO_SVE-NEXT: mov v0.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w10 +; NO_SVE-NEXT: ubfx w10, w8, #5, #1 +; NO_SVE-NEXT: mov v0.b[4], w11 +; NO_SVE-NEXT: ubfx w11, w9, #5, #1 +; NO_SVE-NEXT: ldp q22, q21, [x1, #64] +; NO_SVE-NEXT: mov v1.b[5], w10 +; NO_SVE-NEXT: ubfx w10, w8, #6, #1 +; NO_SVE-NEXT: mov v0.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w9, #6, #1 +; NO_SVE-NEXT: mov v1.b[6], w10 +; NO_SVE-NEXT: ubfx w10, w8, #7, #1 +; NO_SVE-NEXT: mov v0.b[6], w11 +; NO_SVE-NEXT: ubfx w11, w9, #7, #1 +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: mov v1.b[7], w10 +; NO_SVE-NEXT: ubfx w10, w8, #8, #1 +; NO_SVE-NEXT: mov v0.b[7], w11 +; NO_SVE-NEXT: ubfx w11, w9, #8, #1 +; NO_SVE-NEXT: mov v1.b[8], w10 +; NO_SVE-NEXT: ubfx w10, w8, #9, #1 +; NO_SVE-NEXT: mov v0.b[8], w11 +; NO_SVE-NEXT: ubfx w11, w9, #9, #1 +; NO_SVE-NEXT: ldp q19, q18, [x0, #32] +; NO_SVE-NEXT: mov v1.b[9], w10 +; NO_SVE-NEXT: ubfx w10, w8, #10, #1 +; NO_SVE-NEXT: mov v0.b[9], w11 +; NO_SVE-NEXT: ubfx w11, w9, #10, #1 +; NO_SVE-NEXT: mov v1.b[10], w10 +; NO_SVE-NEXT: ubfx w10, w8, #11, #1 +; NO_SVE-NEXT: mov v0.b[10], w11 +; NO_SVE-NEXT: ubfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v1.b[11], w10 +; NO_SVE-NEXT: ubfx w10, w8, #12, #1 +; NO_SVE-NEXT: mov v0.b[11], w11 +; NO_SVE-NEXT: ubfx w11, w9, #12, #1 +; NO_SVE-NEXT: mov v1.b[12], w10 +; NO_SVE-NEXT: ubfx w10, w8, #13, #1 +; NO_SVE-NEXT: mov v0.b[12], w11 +; NO_SVE-NEXT: ubfx w11, w9, #13, #1 +; NO_SVE-NEXT: mov v1.b[13], w10 +; NO_SVE-NEXT: ubfx w10, w8, #14, #1 +; NO_SVE-NEXT: mov v0.b[13], w11 +; NO_SVE-NEXT: ubfx w11, w9, #14, #1 +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: lsr w9, w9, #15 +; NO_SVE-NEXT: mov v1.b[14], w10 +; NO_SVE-NEXT: mov v0.b[14], w11 +; NO_SVE-NEXT: mov v1.b[15], w8 +; NO_SVE-NEXT: mov v0.b[15], w9 +; NO_SVE-NEXT: ext v16.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: ext v17.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: zip1 v20.8b, v1.8b, v0.8b +; NO_SVE-NEXT: zip2 v1.8b, v1.8b, v0.8b +; NO_SVE-NEXT: zip1 v23.8b, v0.8b, v0.8b +; NO_SVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NO_SVE-NEXT: ushll v20.4s, v20.4h, #0 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll v23.4s, 
v23.4h, #0 +; NO_SVE-NEXT: zip1 v26.8b, v16.8b, v0.8b +; NO_SVE-NEXT: zip2 v16.8b, v16.8b, v0.8b +; NO_SVE-NEXT: zip1 v27.8b, v17.8b, v0.8b +; NO_SVE-NEXT: shl v20.4s, v20.4s, #31 +; NO_SVE-NEXT: zip2 v17.8b, v17.8b, v0.8b +; NO_SVE-NEXT: shl v1.4s, v1.4s, #31 +; NO_SVE-NEXT: shl v23.4s, v23.4s, #31 +; NO_SVE-NEXT: shl v0.4s, v0.4s, #31 +; NO_SVE-NEXT: cmlt v20.4s, v20.4s, #0 +; NO_SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NO_SVE-NEXT: ushll v26.4s, v26.4h, #0 +; NO_SVE-NEXT: ushll v16.4s, v16.4h, #0 +; NO_SVE-NEXT: ushll v27.4s, v27.4h, #0 +; NO_SVE-NEXT: cmlt v23.4s, v23.4s, #0 +; NO_SVE-NEXT: cmlt v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bif v7.16b, v25.16b, v20.16b +; NO_SVE-NEXT: ldp q25, q20, [x1, #96] +; NO_SVE-NEXT: ushll v17.4s, v17.4h, #0 +; NO_SVE-NEXT: shl v26.4s, v26.4s, #31 +; NO_SVE-NEXT: bsl v1.16b, v6.16b, v24.16b +; NO_SVE-NEXT: shl v16.4s, v16.4s, #31 +; NO_SVE-NEXT: shl v27.4s, v27.4s, #31 +; NO_SVE-NEXT: shl v17.4s, v17.4s, #31 +; NO_SVE-NEXT: bif v3.16b, v22.16b, v23.16b +; NO_SVE-NEXT: ldp q24, q6, [x1, #32] +; NO_SVE-NEXT: bsl v0.16b, v2.16b, v21.16b +; NO_SVE-NEXT: stp q7, q1, [x0] +; NO_SVE-NEXT: cmlt v2.4s, v26.4s, #0 +; NO_SVE-NEXT: cmlt v16.4s, v16.4s, #0 +; NO_SVE-NEXT: cmlt v21.4s, v27.4s, #0 +; NO_SVE-NEXT: cmlt v17.4s, v17.4s, #0 +; NO_SVE-NEXT: stp q3, q0, [x0, #64] +; NO_SVE-NEXT: mov v0.16b, v21.16b +; NO_SVE-NEXT: mov v3.16b, v16.16b +; NO_SVE-NEXT: mov v1.16b, v2.16b +; NO_SVE-NEXT: bif v5.16b, v20.16b, v17.16b +; NO_SVE-NEXT: bsl v0.16b, v4.16b, v25.16b +; NO_SVE-NEXT: bsl v3.16b, v18.16b, v6.16b +; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b +; NO_SVE-NEXT: stp q0, q5, [x0, #96] +; NO_SVE-NEXT: stp q1, q3, [x0, #32] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -1944,8 +3980,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldr w8, [x2] ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: ptrue p1.s @@ -2020,6 +4056,253 @@ } define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, <64 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v64i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2, #6] +; NO_SVE-NEXT: ldrh w9, [x2, #4] +; NO_SVE-NEXT: ldrh w10, [x2, #2] +; NO_SVE-NEXT: ldrh w11, [x2] +; NO_SVE-NEXT: and w12, w8, #0x1 +; NO_SVE-NEXT: and w13, w9, #0x1 +; NO_SVE-NEXT: ldr q21, [x1] +; NO_SVE-NEXT: and w14, w10, #0x1 +; NO_SVE-NEXT: fmov s1, w12 +; NO_SVE-NEXT: and w12, w11, #0x1 +; NO_SVE-NEXT: fmov s3, w13 +; NO_SVE-NEXT: ubfx w13, w8, #1, #1 +; NO_SVE-NEXT: fmov s2, w14 +; NO_SVE-NEXT: ubfx w14, w9, #1, #1 +; NO_SVE-NEXT: fmov s0, w12 +; NO_SVE-NEXT: ubfx w12, w10, #1, #1 +; NO_SVE-NEXT: mov v1.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w11, #1, #1 +; NO_SVE-NEXT: mov v3.b[1], w14 +; NO_SVE-NEXT: ubfx w14, w8, #2, #1 +; NO_SVE-NEXT: mov v2.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w9, #2, #1 +; NO_SVE-NEXT: mov v0.b[1], w13 +; NO_SVE-NEXT: ubfx w13, w10, #2, #1 +; NO_SVE-NEXT: mov v1.b[2], w14 +; NO_SVE-NEXT: ubfx w14, w11, #2, #1 +; NO_SVE-NEXT: mov v3.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w8, #3, #1 +; NO_SVE-NEXT: mov v2.b[2], w13 +; NO_SVE-NEXT: ubfx w13, w9, #3, #1 +; NO_SVE-NEXT: mov v0.b[2], w14 +; NO_SVE-NEXT: ubfx w14, w10, #3, #1 +; NO_SVE-NEXT: mov v1.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w11, #3, #1 +; NO_SVE-NEXT: mov v3.b[3], w13 +; NO_SVE-NEXT: ubfx w13, w8, #4, #1 +; NO_SVE-NEXT: mov v2.b[3], w14 +; NO_SVE-NEXT: ubfx w14, w9, #4, #1 +; NO_SVE-NEXT: mov v0.b[3], w12 +; NO_SVE-NEXT: ubfx w12, w10, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w13 +; NO_SVE-NEXT: ubfx w13, w11, #4, #1 +; NO_SVE-NEXT: mov v3.b[4], w14 +; NO_SVE-NEXT: ubfx w14, w8, #5, #1 +; NO_SVE-NEXT: mov v2.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w9, #5, #1 +; NO_SVE-NEXT: mov v0.b[4], w13 +; NO_SVE-NEXT: ubfx w13, w10, #5, #1 +; NO_SVE-NEXT: mov v1.b[5], w14 +; NO_SVE-NEXT: ubfx w14, w11, #5, #1 +; NO_SVE-NEXT: mov v3.b[5], w12 +; NO_SVE-NEXT: ubfx w12, w8, #6, #1 +; NO_SVE-NEXT: mov v2.b[5], w13 +; NO_SVE-NEXT: ubfx w13, w9, #6, #1 +; NO_SVE-NEXT: mov v0.b[5], w14 +; NO_SVE-NEXT: ubfx w14, w10, #6, #1 +; NO_SVE-NEXT: mov v1.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w11, #6, #1 +; NO_SVE-NEXT: mov v3.b[6], w13 +; NO_SVE-NEXT: ubfx w13, w8, #7, #1 +; NO_SVE-NEXT: mov v2.b[6], w14 +; NO_SVE-NEXT: ubfx w14, w9, #7, #1 +; NO_SVE-NEXT: mov v0.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w10, #7, #1 +; NO_SVE-NEXT: mov v1.b[7], w13 +; NO_SVE-NEXT: ubfx w13, w11, #7, #1 +; NO_SVE-NEXT: mov v3.b[7], w14 +; NO_SVE-NEXT: ubfx w14, w8, #8, #1 +; NO_SVE-NEXT: mov v2.b[7], w12 +; NO_SVE-NEXT: ubfx w12, w9, #8, #1 +; NO_SVE-NEXT: mov v0.b[7], w13 +; NO_SVE-NEXT: ubfx w13, w10, #8, #1 +; NO_SVE-NEXT: mov v1.b[8], w14 +; NO_SVE-NEXT: ubfx w14, w11, #8, #1 +; NO_SVE-NEXT: mov v3.b[8], w12 +; NO_SVE-NEXT: ubfx w12, w8, #9, #1 +; NO_SVE-NEXT: mov v2.b[8], w13 +; NO_SVE-NEXT: ubfx w13, w9, #9, #1 +; NO_SVE-NEXT: mov v0.b[8], w14 +; NO_SVE-NEXT: ubfx w14, w10, #9, #1 +; NO_SVE-NEXT: mov v1.b[9], w12 +; NO_SVE-NEXT: ubfx w12, w11, #9, #1 +; NO_SVE-NEXT: mov v3.b[9], w13 +; NO_SVE-NEXT: ubfx w13, w8, #10, 
#1 +; NO_SVE-NEXT: mov v2.b[9], w14 +; NO_SVE-NEXT: ubfx w14, w9, #10, #1 +; NO_SVE-NEXT: mov v0.b[9], w12 +; NO_SVE-NEXT: ubfx w12, w10, #10, #1 +; NO_SVE-NEXT: mov v1.b[10], w13 +; NO_SVE-NEXT: ubfx w13, w11, #10, #1 +; NO_SVE-NEXT: mov v3.b[10], w14 +; NO_SVE-NEXT: ubfx w14, w8, #11, #1 +; NO_SVE-NEXT: mov v2.b[10], w12 +; NO_SVE-NEXT: ubfx w12, w9, #11, #1 +; NO_SVE-NEXT: mov v0.b[10], w13 +; NO_SVE-NEXT: ubfx w13, w10, #11, #1 +; NO_SVE-NEXT: mov v1.b[11], w14 +; NO_SVE-NEXT: ubfx w14, w11, #11, #1 +; NO_SVE-NEXT: mov v3.b[11], w12 +; NO_SVE-NEXT: ubfx w12, w8, #12, #1 +; NO_SVE-NEXT: mov v2.b[11], w13 +; NO_SVE-NEXT: ubfx w13, w9, #12, #1 +; NO_SVE-NEXT: mov v0.b[11], w14 +; NO_SVE-NEXT: ubfx w14, w10, #12, #1 +; NO_SVE-NEXT: mov v1.b[12], w12 +; NO_SVE-NEXT: ubfx w12, w11, #12, #1 +; NO_SVE-NEXT: mov v3.b[12], w13 +; NO_SVE-NEXT: ubfx w13, w8, #13, #1 +; NO_SVE-NEXT: mov v2.b[12], w14 +; NO_SVE-NEXT: ubfx w14, w9, #13, #1 +; NO_SVE-NEXT: mov v0.b[12], w12 +; NO_SVE-NEXT: ubfx w12, w10, #13, #1 +; NO_SVE-NEXT: mov v1.b[13], w13 +; NO_SVE-NEXT: ubfx w13, w11, #13, #1 +; NO_SVE-NEXT: mov v3.b[13], w14 +; NO_SVE-NEXT: ubfx w14, w8, #14, #1 +; NO_SVE-NEXT: mov v2.b[13], w12 +; NO_SVE-NEXT: ubfx w12, w9, #14, #1 +; NO_SVE-NEXT: mov v0.b[13], w13 +; NO_SVE-NEXT: ubfx w13, w10, #14, #1 +; NO_SVE-NEXT: mov v1.b[14], w14 +; NO_SVE-NEXT: ubfx w14, w11, #14, #1 +; NO_SVE-NEXT: lsr w10, w10, #15 +; NO_SVE-NEXT: lsr w11, w11, #15 +; NO_SVE-NEXT: mov v2.b[14], w13 +; NO_SVE-NEXT: lsr w9, w9, #15 +; NO_SVE-NEXT: mov v0.b[14], w14 +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: mov v3.b[14], w12 +; NO_SVE-NEXT: ldp q19, q18, [x0, #96] +; NO_SVE-NEXT: mov v2.b[15], w10 +; NO_SVE-NEXT: mov v0.b[15], w11 +; NO_SVE-NEXT: mov v3.b[15], w9 +; NO_SVE-NEXT: mov v1.b[15], w8 +; NO_SVE-NEXT: ext v16.16b, v2.16b, v2.16b, #8 +; NO_SVE-NEXT: ldp q26, q25, [x1, #96] +; NO_SVE-NEXT: ext v17.16b, v3.16b, v3.16b, #8 +; NO_SVE-NEXT: zip1 v20.8b, v0.8b, v0.8b +; NO_SVE-NEXT: zip2 v22.8b, v0.8b, v0.8b +; NO_SVE-NEXT: zip1 v23.8b, v16.8b, v0.8b +; NO_SVE-NEXT: zip2 v16.8b, v16.8b, v0.8b +; NO_SVE-NEXT: zip1 v24.8b, v17.8b, v0.8b +; NO_SVE-NEXT: zip2 v17.8b, v17.8b, v0.8b +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: ushll v23.4s, v23.4h, #0 +; NO_SVE-NEXT: ushll v16.4s, v16.4h, #0 +; NO_SVE-NEXT: shl v23.4s, v23.4s, #31 +; NO_SVE-NEXT: shl v16.4s, v16.4s, #31 +; NO_SVE-NEXT: cmlt v23.4s, v23.4s, #0 +; NO_SVE-NEXT: cmlt v16.4s, v16.4s, #0 +; NO_SVE-NEXT: ushll v17.4s, v17.4h, #0 +; NO_SVE-NEXT: bif v19.16b, v26.16b, v23.16b +; NO_SVE-NEXT: ldp q26, q23, [x1, #160] +; NO_SVE-NEXT: ushll v20.4s, v20.4h, #0 +; NO_SVE-NEXT: shl v17.4s, v17.4s, #31 +; NO_SVE-NEXT: bsl v16.16b, v18.16b, v25.16b +; NO_SVE-NEXT: zip1 v18.8b, v2.8b, v0.8b +; NO_SVE-NEXT: ushll v22.4s, v22.4h, #0 +; NO_SVE-NEXT: shl v20.4s, v20.4s, #31 +; NO_SVE-NEXT: cmlt v17.4s, v17.4s, #0 +; NO_SVE-NEXT: ldp q5, q4, [x0] +; NO_SVE-NEXT: cmlt v20.4s, v20.4s, #0 +; NO_SVE-NEXT: shl v22.4s, v22.4s, #31 +; NO_SVE-NEXT: ushll v18.4s, v18.4h, #0 +; NO_SVE-NEXT: bif v6.16b, v23.16b, v17.16b +; NO_SVE-NEXT: ushll v24.4s, v24.4h, #0 +; NO_SVE-NEXT: bif v5.16b, v21.16b, v20.16b +; NO_SVE-NEXT: ldr q17, [x1, #16] +; NO_SVE-NEXT: shl v18.4s, v18.4s, #31 +; NO_SVE-NEXT: cmlt v20.4s, v22.4s, #0 +; NO_SVE-NEXT: zip1 v22.8b, v3.8b, v0.8b +; NO_SVE-NEXT: shl v24.4s, v24.4s, #31 +; NO_SVE-NEXT: ldp q21, q25, [x1, #64] +; NO_SVE-NEXT: bif v4.16b, v17.16b, v20.16b +; NO_SVE-NEXT: zip2 v3.8b, v3.8b, v0.8b +; NO_SVE-NEXT: cmlt v17.4s, v18.4s, #0 +; NO_SVE-NEXT: cmlt v24.4s, 
v24.4s, #0 +; NO_SVE-NEXT: ushll v20.4s, v22.4h, #0 +; NO_SVE-NEXT: bif v7.16b, v26.16b, v24.16b +; NO_SVE-NEXT: ldp q23, q18, [x0, #64] +; NO_SVE-NEXT: shl v20.4s, v20.4s, #31 +; NO_SVE-NEXT: zip2 v26.8b, v1.8b, v0.8b +; NO_SVE-NEXT: ushll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b +; NO_SVE-NEXT: cmlt v20.4s, v20.4s, #0 +; NO_SVE-NEXT: bsl v17.16b, v23.16b, v21.16b +; NO_SVE-NEXT: shl v3.4s, v3.4s, #31 +; NO_SVE-NEXT: ldp q22, q21, [x0, #128] +; NO_SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: shl v2.4s, v2.4s, #31 +; NO_SVE-NEXT: ldp q24, q23, [x1, #128] +; NO_SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; NO_SVE-NEXT: bsl v2.16b, v18.16b, v25.16b +; NO_SVE-NEXT: bsl v20.16b, v22.16b, v24.16b +; NO_SVE-NEXT: zip1 v24.8b, v1.8b, v0.8b +; NO_SVE-NEXT: bsl v3.16b, v21.16b, v23.16b +; NO_SVE-NEXT: ushll v21.4s, v26.4h, #0 +; NO_SVE-NEXT: ldp q23, q22, [x0, #192] +; NO_SVE-NEXT: ushll v24.4s, v24.4h, #0 +; NO_SVE-NEXT: shl v21.4s, v21.4s, #31 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NO_SVE-NEXT: shl v24.4s, v24.4s, #31 +; NO_SVE-NEXT: cmlt v21.4s, v21.4s, #0 +; NO_SVE-NEXT: cmlt v24.4s, v24.4s, #0 +; NO_SVE-NEXT: ldp q27, q26, [x1, #192] +; NO_SVE-NEXT: bif v23.16b, v27.16b, v24.16b +; NO_SVE-NEXT: bsl v21.16b, v22.16b, v26.16b +; NO_SVE-NEXT: ldp q26, q22, [x1, #224] +; NO_SVE-NEXT: ldp q27, q24, [x1, #32] +; NO_SVE-NEXT: stp q5, q4, [x0] +; NO_SVE-NEXT: stp q17, q2, [x0, #64] +; NO_SVE-NEXT: stp q19, q16, [x0, #96] +; NO_SVE-NEXT: stp q20, q3, [x0, #128] +; NO_SVE-NEXT: stp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q23, q21, [x0, #192] +; NO_SVE-NEXT: zip1 v21.8b, v1.8b, v0.8b +; NO_SVE-NEXT: zip2 v1.8b, v1.8b, v0.8b +; NO_SVE-NEXT: ldp q18, q2, [x0, #224] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: ushll v3.4s, v21.4h, #0 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: shl v3.4s, v3.4s, #31 +; NO_SVE-NEXT: shl v1.4s, v1.4s, #31 +; NO_SVE-NEXT: zip1 v17.8b, v0.8b, v0.8b +; NO_SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; NO_SVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NO_SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NO_SVE-NEXT: bsl v1.16b, v2.16b, v22.16b +; NO_SVE-NEXT: mov v2.16b, v3.16b +; NO_SVE-NEXT: bsl v2.16b, v18.16b, v26.16b +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll v3.4s, v17.4h, #0 +; NO_SVE-NEXT: shl v0.4s, v0.4s, #31 +; NO_SVE-NEXT: stp q2, q1, [x0, #224] +; NO_SVE-NEXT: shl v3.4s, v3.4s, #31 +; NO_SVE-NEXT: ldp q2, q1, [x0, #32] +; NO_SVE-NEXT: cmlt v0.4s, v0.4s, #0 +; NO_SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; NO_SVE-NEXT: bsl v0.16b, v1.16b, v24.16b +; NO_SVE-NEXT: mov v1.16b, v3.16b +; NO_SVE-NEXT: bsl v1.16b, v2.16b, v27.16b +; NO_SVE-NEXT: stp q1, q0, [x0, #32] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -2028,8 +4311,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr x8, [x2] ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 ; VBITS_GE_2048-NEXT: ptrue p1.s @@ -2153,6 +4436,14 @@ ; Don't use SVE for 64-bit vectors. 
define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v1i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: tst w0, #0x1 +; NO_SVE-NEXT: csetm x8, ne +; NO_SVE-NEXT: fmov d2, x8 +; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 @@ -2166,6 +4457,14 @@ ; Don't use SVE for 128-bit vectors. define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #0 { +; NO_SVE-LABEL: select_v2i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: shl v2.2d, v2.2d, #63 +; NO_SVE-NEXT: cmlt v2.2d, v2.2d, #0 +; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v2.2d, v2.2s, #0 @@ -2178,6 +4477,38 @@ } define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, <4 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v4i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrb w8, [x2] +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: and w9, w8, #0x1 +; NO_SVE-NEXT: ubfx w10, w8, #1, #1 +; NO_SVE-NEXT: fmov s0, w9 +; NO_SVE-NEXT: ubfx w9, w8, #2, #1 +; NO_SVE-NEXT: ubfx w8, w8, #3, #1 +; NO_SVE-NEXT: ldp q4, q5, [x1] +; NO_SVE-NEXT: mov v0.h[1], w10 +; NO_SVE-NEXT: mov v0.h[2], w9 +; NO_SVE-NEXT: mov v0.h[3], w8 +; NO_SVE-NEXT: umov w8, v0.h[0] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[1] +; NO_SVE-NEXT: fmov s1, w8 +; NO_SVE-NEXT: umov w8, v0.h[3] +; NO_SVE-NEXT: fmov s0, w9 +; NO_SVE-NEXT: mov v1.s[1], w10 +; NO_SVE-NEXT: mov v0.s[1], w8 +; NO_SVE-NEXT: ushll v1.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: shl v1.2d, v1.2d, #63 +; NO_SVE-NEXT: shl v0.2d, v0.2d, #63 +; NO_SVE-NEXT: cmlt v1.2d, v1.2d, #0 +; NO_SVE-NEXT: cmlt v0.2d, v0.2d, #0 +; NO_SVE-NEXT: bsl v1.16b, v2.16b, v4.16b +; NO_SVE-NEXT: bsl v0.16b, v3.16b, v5.16b +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -2186,8 +4517,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldrb w8, [x2] ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p1.d @@ -2223,6 +4554,65 @@ } define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, <8 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrb w8, [x2] +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: and w9, w8, #0x1 +; NO_SVE-NEXT: ubfx w10, w8, #1, #1 +; NO_SVE-NEXT: fmov s1, w9 +; NO_SVE-NEXT: ubfx w9, w8, #2, #1 +; NO_SVE-NEXT: ldp q16, q17, [x1, #32] +; NO_SVE-NEXT: mov v1.b[1], w10 +; NO_SVE-NEXT: mov v1.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #3, #1 +; NO_SVE-NEXT: mov v1.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w8, #5, #1 +; NO_SVE-NEXT: mov v1.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w8, #6, #1 +; NO_SVE-NEXT: lsr w8, w8, #7 +; NO_SVE-NEXT: mov v1.b[6], w9 +; NO_SVE-NEXT: mov v1.b[7], w8 +; NO_SVE-NEXT: umov w8, v1.b[2] +; NO_SVE-NEXT: umov w9, v1.b[4] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[6] +; NO_SVE-NEXT: fmov s3, w8 +; NO_SVE-NEXT: umov w8, v1.b[5] +; NO_SVE-NEXT: fmov s4, w9 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: fmov s5, w10 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: fmov s6, w11 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: mov v4.s[1], w8 +; NO_SVE-NEXT: mov v3.s[1], w9 +; NO_SVE-NEXT: mov v5.s[1], w10 +; NO_SVE-NEXT: mov v6.s[1], w11 +; NO_SVE-NEXT: ushll v4.2d, v4.2s, #0 +; NO_SVE-NEXT: ldp q1, q7, [x0] +; NO_SVE-NEXT: shl v4.2d, v4.2d, #63 +; NO_SVE-NEXT: ushll v3.2d, v3.2s, #0 +; NO_SVE-NEXT: cmlt v4.2d, v4.2d, #0 +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: bif v0.16b, v16.16b, v4.16b +; NO_SVE-NEXT: ldp q4, q16, [x1] +; NO_SVE-NEXT: shl v3.2d, v3.2d, #63 +; NO_SVE-NEXT: shl v5.2d, v5.2d, #63 +; NO_SVE-NEXT: shl v6.2d, v6.2d, #63 +; NO_SVE-NEXT: cmlt v3.2d, v3.2d, #0 +; NO_SVE-NEXT: cmlt v6.2d, v6.2d, #0 +; NO_SVE-NEXT: cmlt v5.2d, v5.2d, #0 +; NO_SVE-NEXT: bif v2.16b, v17.16b, v6.16b +; NO_SVE-NEXT: bif v1.16b, v4.16b, v5.16b +; NO_SVE-NEXT: bsl v3.16b, v7.16b, v16.16b +; NO_SVE-NEXT: stp q0, q2, [x0, #32] +; NO_SVE-NEXT: stp q1, q3, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -2231,8 +4621,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldrb w8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ptrue p1.d @@ -2278,6 +4668,121 @@ } define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, <16 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2] +; NO_SVE-NEXT: ldp q25, q24, [x1, #64] +; NO_SVE-NEXT: and w9, w8, #0x1 +; NO_SVE-NEXT: ubfx w10, w8, #1, #1 +; NO_SVE-NEXT: fmov s1, w9 +; NO_SVE-NEXT: ubfx w9, w8, #2, #1 +; NO_SVE-NEXT: ldp q7, q3, [x0, #80] +; NO_SVE-NEXT: mov v1.b[1], w10 +; NO_SVE-NEXT: mov v1.b[2], w9 +; NO_SVE-NEXT: ubfx w9, w8, #3, #1 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: ldp q16, q2, [x0, #48] +; NO_SVE-NEXT: mov v1.b[3], w9 +; NO_SVE-NEXT: ubfx w9, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[4], w9 +; NO_SVE-NEXT: ubfx w9, w8, #5, #1 +; NO_SVE-NEXT: ldr q23, [x0] +; NO_SVE-NEXT: mov v1.b[5], w9 +; NO_SVE-NEXT: ubfx w9, w8, #6, #1 +; NO_SVE-NEXT: mov v1.b[6], w9 +; NO_SVE-NEXT: ubfx w9, w8, #7, #1 +; NO_SVE-NEXT: mov v1.b[7], w9 +; NO_SVE-NEXT: ubfx w9, w8, #8, #1 +; NO_SVE-NEXT: mov v1.b[8], w9 +; NO_SVE-NEXT: ubfx w9, w8, #9, #1 +; NO_SVE-NEXT: mov v1.b[9], w9 +; NO_SVE-NEXT: ubfx w9, w8, #10, #1 +; NO_SVE-NEXT: mov v1.b[10], w9 +; NO_SVE-NEXT: ubfx w9, w8, #11, #1 +; NO_SVE-NEXT: mov v1.b[11], w9 +; NO_SVE-NEXT: ubfx w9, w8, #12, #1 +; NO_SVE-NEXT: mov v1.b[12], w9 +; NO_SVE-NEXT: ubfx w9, w8, #13, #1 +; NO_SVE-NEXT: mov v1.b[13], w9 +; NO_SVE-NEXT: ubfx w9, w8, #14, #1 +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: mov v1.b[14], w9 +; NO_SVE-NEXT: mov v1.b[15], w8 +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[4] +; NO_SVE-NEXT: fmov s5, w9 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: fmov s6, w10 +; NO_SVE-NEXT: umov w10, v1.b[5] +; NO_SVE-NEXT: fmov s4, w11 +; NO_SVE-NEXT: umov w11, v1.b[8] +; NO_SVE-NEXT: mov v5.s[1], w12 +; NO_SVE-NEXT: umov w12, v1.b[10] +; NO_SVE-NEXT: mov v6.s[1], w9 +; NO_SVE-NEXT: umov w9, v1.b[6] +; NO_SVE-NEXT: mov v4.s[1], w10 +; NO_SVE-NEXT: umov w10, v1.b[12] +; NO_SVE-NEXT: fmov s20, w12 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: fmov s19, w11 +; NO_SVE-NEXT: fmov s18, w9 +; NO_SVE-NEXT: umov w9, v1.b[14] +; NO_SVE-NEXT: fmov s21, w10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: umov w10, v1.b[9] +; NO_SVE-NEXT: mov v20.s[1], w12 +; NO_SVE-NEXT: fmov s22, w9 +; NO_SVE-NEXT: umov w9, v1.b[13] +; NO_SVE-NEXT: mov v18.s[1], w11 +; NO_SVE-NEXT: ushll v20.2d, v20.2s, #0 +; NO_SVE-NEXT: mov v22.s[1], w8 +; NO_SVE-NEXT: mov v21.s[1], w9 +; NO_SVE-NEXT: shl v20.2d, v20.2d, #63 +; NO_SVE-NEXT: mov v19.s[1], w10 +; NO_SVE-NEXT: cmlt v20.2d, v20.2d, #0 +; NO_SVE-NEXT: ushll v21.2d, v21.2s, #0 +; NO_SVE-NEXT: bif v7.16b, v24.16b, v20.16b +; NO_SVE-NEXT: ldp q24, q20, [x1, #96] +; NO_SVE-NEXT: ushll v22.2d, v22.2s, #0 +; NO_SVE-NEXT: shl v21.2d, v21.2d, #63 +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: shl v22.2d, v22.2d, #63 +; NO_SVE-NEXT: cmlt v21.2d, v21.2d, #0 +; NO_SVE-NEXT: shl v5.2d, v5.2d, #63 +; NO_SVE-NEXT: cmlt v22.2d, v22.2d, #0 +; NO_SVE-NEXT: ushll v4.2d, v4.2s, #0 +; NO_SVE-NEXT: ldp q1, q17, [x0, #16] +; NO_SVE-NEXT: 
ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: shl v4.2d, v4.2d, #63 +; NO_SVE-NEXT: bif v3.16b, v24.16b, v21.16b +; NO_SVE-NEXT: ushll v18.2d, v18.2s, #0 +; NO_SVE-NEXT: ushll v19.2d, v19.2s, #0 +; NO_SVE-NEXT: bif v0.16b, v20.16b, v22.16b +; NO_SVE-NEXT: ldp q24, q21, [x1, #32] +; NO_SVE-NEXT: shl v6.2d, v6.2d, #63 +; NO_SVE-NEXT: shl v18.2d, v18.2d, #63 +; NO_SVE-NEXT: shl v19.2d, v19.2d, #63 +; NO_SVE-NEXT: cmlt v5.2d, v5.2d, #0 +; NO_SVE-NEXT: cmlt v4.2d, v4.2d, #0 +; NO_SVE-NEXT: cmlt v6.2d, v6.2d, #0 +; NO_SVE-NEXT: ldp q22, q20, [x1] +; NO_SVE-NEXT: cmlt v19.2d, v19.2d, #0 +; NO_SVE-NEXT: stp q3, q0, [x0, #96] +; NO_SVE-NEXT: cmlt v18.2d, v18.2d, #0 +; NO_SVE-NEXT: mov v0.16b, v4.16b +; NO_SVE-NEXT: mov v3.16b, v5.16b +; NO_SVE-NEXT: bif v2.16b, v25.16b, v19.16b +; NO_SVE-NEXT: bif v16.16b, v21.16b, v18.16b +; NO_SVE-NEXT: bsl v0.16b, v17.16b, v24.16b +; NO_SVE-NEXT: bif v1.16b, v20.16b, v6.16b +; NO_SVE-NEXT: bsl v3.16b, v23.16b, v22.16b +; NO_SVE-NEXT: stp q2, q7, [x0, #64] +; NO_SVE-NEXT: stp q0, q16, [x0, #32] +; NO_SVE-NEXT: stp q3, q1, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -2286,8 +4791,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldrh w8, [x2] ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; VBITS_GE_1024-NEXT: ptrue p1.d @@ -2353,6 +4858,227 @@ } define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, <32 x i1>* %c) #0 { +; NO_SVE-LABEL: select_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldrh w8, [x2] +; NO_SVE-NEXT: ldrh w9, [x2, #2] +; NO_SVE-NEXT: ldp q16, q6, [x0, #32] +; NO_SVE-NEXT: and w10, w8, #0x1 +; NO_SVE-NEXT: ubfx w12, w8, #1, #1 +; NO_SVE-NEXT: and w11, w9, #0x1 +; NO_SVE-NEXT: fmov s0, w10 +; NO_SVE-NEXT: ubfx w10, w9, #1, #1 +; NO_SVE-NEXT: fmov s1, w11 +; NO_SVE-NEXT: ubfx w11, w8, #2, #1 +; NO_SVE-NEXT: mov v0.b[1], w12 +; NO_SVE-NEXT: ubfx w12, w9, #2, #1 +; NO_SVE-NEXT: mov v1.b[1], w10 +; NO_SVE-NEXT: ubfx w10, w8, #3, #1 +; NO_SVE-NEXT: mov v0.b[2], w11 +; NO_SVE-NEXT: ubfx w11, w9, #3, #1 +; NO_SVE-NEXT: mov v1.b[2], w12 +; NO_SVE-NEXT: ubfx w12, w9, #4, #1 +; NO_SVE-NEXT: mov v0.b[3], w10 +; NO_SVE-NEXT: ubfx w10, w8, #4, #1 +; NO_SVE-NEXT: mov v1.b[3], w11 +; NO_SVE-NEXT: ubfx w11, w9, #5, #1 +; NO_SVE-NEXT: mov v0.b[4], w10 +; NO_SVE-NEXT: ubfx w10, w8, #5, #1 +; NO_SVE-NEXT: mov v1.b[4], w12 +; NO_SVE-NEXT: ubfx w12, w9, #6, #1 +; NO_SVE-NEXT: mov v0.b[5], w10 +; NO_SVE-NEXT: ubfx w10, w8, #6, #1 +; NO_SVE-NEXT: mov v1.b[5], w11 +; NO_SVE-NEXT: ubfx w11, w9, #7, #1 +; NO_SVE-NEXT: mov v0.b[6], w10 +; NO_SVE-NEXT: ubfx w10, w8, #7, #1 +; NO_SVE-NEXT: mov v1.b[6], w12 +; NO_SVE-NEXT: ubfx w12, w9, #8, #1 +; NO_SVE-NEXT: mov v0.b[7], w10 +; NO_SVE-NEXT: ubfx w10, w8, #8, #1 +; NO_SVE-NEXT: mov v1.b[7], w11 +; NO_SVE-NEXT: ubfx w11, w9, #9, #1 +; NO_SVE-NEXT: mov v0.b[8], w10 +; NO_SVE-NEXT: ubfx w10, w8, #9, #1 +; NO_SVE-NEXT: mov v1.b[8], w12 +; NO_SVE-NEXT: ubfx w12, w9, #10, #1 +; NO_SVE-NEXT: mov v0.b[9], w10 +; NO_SVE-NEXT: ubfx w10, w8, #10, #1 +; NO_SVE-NEXT: mov v1.b[9], w11 +; NO_SVE-NEXT: ubfx w11, w9, #11, #1 +; NO_SVE-NEXT: mov v0.b[10], w10 +; NO_SVE-NEXT: ubfx w10, w8, #11, #1 +; NO_SVE-NEXT: mov v1.b[10], w12 +; NO_SVE-NEXT: 
ubfx w12, w9, #12, #1 +; NO_SVE-NEXT: mov v0.b[11], w10 +; NO_SVE-NEXT: ubfx w10, w8, #12, #1 +; NO_SVE-NEXT: mov v1.b[11], w11 +; NO_SVE-NEXT: ubfx w11, w9, #13, #1 +; NO_SVE-NEXT: mov v0.b[12], w10 +; NO_SVE-NEXT: ubfx w10, w8, #13, #1 +; NO_SVE-NEXT: mov v1.b[12], w12 +; NO_SVE-NEXT: ubfx w12, w9, #14, #1 +; NO_SVE-NEXT: lsr w9, w9, #15 +; NO_SVE-NEXT: mov v0.b[13], w10 +; NO_SVE-NEXT: ubfx w10, w8, #14, #1 +; NO_SVE-NEXT: mov v1.b[13], w11 +; NO_SVE-NEXT: lsr w8, w8, #15 +; NO_SVE-NEXT: mov v0.b[14], w10 +; NO_SVE-NEXT: mov v1.b[14], w12 +; NO_SVE-NEXT: mov v0.b[15], w8 +; NO_SVE-NEXT: mov v1.b[15], w9 +; NO_SVE-NEXT: umov w11, v0.b[6] +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[8] +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: umov w15, v0.b[10] +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: umov w16, v1.b[2] +; NO_SVE-NEXT: fmov s3, w11 +; NO_SVE-NEXT: umov w11, v0.b[9] +; NO_SVE-NEXT: fmov s2, w10 +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: fmov s4, w13 +; NO_SVE-NEXT: fmov s5, w15 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: umov w13, v0.b[11] +; NO_SVE-NEXT: mov v2.s[1], w12 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: mov v4.s[1], w11 +; NO_SVE-NEXT: umov w11, v1.b[4] +; NO_SVE-NEXT: fmov s7, w10 +; NO_SVE-NEXT: umov w10, v1.b[3] +; NO_SVE-NEXT: mov v3.s[1], w14 +; NO_SVE-NEXT: umov w14, v1.b[8] +; NO_SVE-NEXT: fmov s19, w12 +; NO_SVE-NEXT: mov v7.s[1], w15 +; NO_SVE-NEXT: umov w15, v1.b[10] +; NO_SVE-NEXT: fmov s18, w11 +; NO_SVE-NEXT: umov w11, v1.b[12] +; NO_SVE-NEXT: umov w12, v1.b[14] +; NO_SVE-NEXT: mov v5.s[1], w13 +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: fmov s20, w14 +; NO_SVE-NEXT: umov w14, v1.b[7] +; NO_SVE-NEXT: fmov s21, w15 +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: fmov s22, w11 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: fmov s23, w12 +; NO_SVE-NEXT: umov w12, v1.b[13] +; NO_SVE-NEXT: fmov s17, w16 +; NO_SVE-NEXT: mov v18.s[1], w13 +; NO_SVE-NEXT: umov w13, v0.b[14] +; NO_SVE-NEXT: mov v19.s[1], w14 +; NO_SVE-NEXT: mov v17.s[1], w10 +; NO_SVE-NEXT: mov v20.s[1], w15 +; NO_SVE-NEXT: mov v21.s[1], w11 +; NO_SVE-NEXT: mov v22.s[1], w12 +; NO_SVE-NEXT: mov v23.s[1], w9 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll v3.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v4.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll v5.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll v7.2d, v17.2s, #0 +; NO_SVE-NEXT: ushll v17.2d, v18.2s, #0 +; NO_SVE-NEXT: ushll v18.2d, v19.2s, #0 +; NO_SVE-NEXT: ushll v19.2d, v20.2s, #0 +; NO_SVE-NEXT: ushll v20.2d, v21.2s, #0 +; NO_SVE-NEXT: ushll v21.2d, v22.2s, #0 +; NO_SVE-NEXT: ushll v22.2d, v23.2s, #0 +; NO_SVE-NEXT: ldp q24, q23, [x1, #32] +; NO_SVE-NEXT: shl v1.2d, v1.2d, #63 +; NO_SVE-NEXT: shl v2.2d, v2.2d, #63 +; NO_SVE-NEXT: shl v3.2d, v3.2d, #63 +; NO_SVE-NEXT: shl v4.2d, v4.2d, #63 +; NO_SVE-NEXT: cmlt v1.2d, v1.2d, #0 +; NO_SVE-NEXT: shl v5.2d, v5.2d, #63 +; NO_SVE-NEXT: cmlt v2.2d, v2.2d, #0 +; NO_SVE-NEXT: bsl v1.16b, v16.16b, v24.16b +; NO_SVE-NEXT: ldp q24, q16, [x0, #64] +; NO_SVE-NEXT: bsl v2.16b, v6.16b, v23.16b +; NO_SVE-NEXT: shl v7.2d, v7.2d, #63 +; NO_SVE-NEXT: cmlt v3.2d, v3.2d, #0 +; NO_SVE-NEXT: cmlt v4.2d, v4.2d, #0 +; NO_SVE-NEXT: cmlt v5.2d, v5.2d, #0 +; NO_SVE-NEXT: cmlt v7.2d, v7.2d, #0 +; NO_SVE-NEXT: ldp q23, q6, [x1, #64] +; NO_SVE-NEXT: shl v17.2d, v17.2d, #63 +; NO_SVE-NEXT: shl v18.2d, v18.2d, #63 +; NO_SVE-NEXT: shl v19.2d, v19.2d, #63 +; NO_SVE-NEXT: shl v20.2d, v20.2d, #63 +; NO_SVE-NEXT: cmlt v17.2d, v17.2d, #0 +; 
NO_SVE-NEXT: shl v21.2d, v21.2d, #63 +; NO_SVE-NEXT: bsl v3.16b, v24.16b, v23.16b +; NO_SVE-NEXT: cmlt v18.2d, v18.2d, #0 +; NO_SVE-NEXT: ldp q24, q23, [x0, #128] +; NO_SVE-NEXT: bsl v4.16b, v16.16b, v6.16b +; NO_SVE-NEXT: shl v22.2d, v22.2d, #63 +; NO_SVE-NEXT: cmlt v19.2d, v19.2d, #0 +; NO_SVE-NEXT: cmlt v20.2d, v20.2d, #0 +; NO_SVE-NEXT: cmlt v21.2d, v21.2d, #0 +; NO_SVE-NEXT: cmlt v22.2d, v22.2d, #0 +; NO_SVE-NEXT: ldp q16, q6, [x1, #128] +; NO_SVE-NEXT: umov w14, v0.b[12] +; NO_SVE-NEXT: umov w15, v0.b[13] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: bsl v5.16b, v24.16b, v16.16b +; NO_SVE-NEXT: ldp q24, q16, [x0, #160] +; NO_SVE-NEXT: bit v6.16b, v23.16b, v7.16b +; NO_SVE-NEXT: ldp q23, q7, [x1, #160] +; NO_SVE-NEXT: bsl v17.16b, v24.16b, v23.16b +; NO_SVE-NEXT: ldp q24, q23, [x0, #192] +; NO_SVE-NEXT: bit v7.16b, v16.16b, v18.16b +; NO_SVE-NEXT: ldp q18, q16, [x1, #192] +; NO_SVE-NEXT: bit v18.16b, v24.16b, v19.16b +; NO_SVE-NEXT: ldp q24, q19, [x0, #224] +; NO_SVE-NEXT: bit v16.16b, v23.16b, v20.16b +; NO_SVE-NEXT: ldp q23, q20, [x1, #224] +; NO_SVE-NEXT: bsl v21.16b, v24.16b, v23.16b +; NO_SVE-NEXT: bif v19.16b, v20.16b, v22.16b +; NO_SVE-NEXT: ldp q22, q20, [x1, #96] +; NO_SVE-NEXT: ldp q0, q23, [x1] +; NO_SVE-NEXT: stp q1, q2, [x0, #32] +; NO_SVE-NEXT: stp q3, q4, [x0, #64] +; NO_SVE-NEXT: stp q5, q6, [x0, #128] +; NO_SVE-NEXT: stp q17, q7, [x0, #160] +; NO_SVE-NEXT: stp q18, q16, [x0, #192] +; NO_SVE-NEXT: fmov s16, w14 +; NO_SVE-NEXT: stp q21, q19, [x0, #224] +; NO_SVE-NEXT: fmov s21, w13 +; NO_SVE-NEXT: ldp q17, q7, [x0, #96] +; NO_SVE-NEXT: mov v16.s[1], w15 +; NO_SVE-NEXT: fmov s19, w9 +; NO_SVE-NEXT: mov v21.s[1], w8 +; NO_SVE-NEXT: fmov s18, w11 +; NO_SVE-NEXT: mov v19.s[1], w10 +; NO_SVE-NEXT: ushll v16.2d, v16.2s, #0 +; NO_SVE-NEXT: ushll v21.2d, v21.2s, #0 +; NO_SVE-NEXT: shl v16.2d, v16.2d, #63 +; NO_SVE-NEXT: shl v21.2d, v21.2d, #63 +; NO_SVE-NEXT: cmlt v16.2d, v16.2d, #0 +; NO_SVE-NEXT: cmlt v21.2d, v21.2d, #0 +; NO_SVE-NEXT: mov v6.16b, v16.16b +; NO_SVE-NEXT: mov v18.s[1], w12 +; NO_SVE-NEXT: bif v7.16b, v20.16b, v21.16b +; NO_SVE-NEXT: bsl v6.16b, v17.16b, v22.16b +; NO_SVE-NEXT: ushll v16.2d, v19.2s, #0 +; NO_SVE-NEXT: ushll v5.2d, v18.2s, #0 +; NO_SVE-NEXT: shl v16.2d, v16.2d, #63 +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: shl v5.2d, v5.2d, #63 +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: cmlt v5.2d, v5.2d, #0 +; NO_SVE-NEXT: cmlt v16.2d, v16.2d, #0 +; NO_SVE-NEXT: bit v0.16b, v7.16b, v16.16b +; NO_SVE-NEXT: bsl v5.16b, v6.16b, v23.16b +; NO_SVE-NEXT: stp q0, q5, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -2361,8 +5087,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr w8, [x2] ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK @@ -25,6 +26,38 @@ ; define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldrb w8, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fmov s0, w8 +; NO_SVE-NEXT: ldrb w8, [x0, #1] +; NO_SVE-NEXT: mov v0.s[1], w8 +; NO_SVE-NEXT: cmeq v0.2s, v0.2s, #0 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB0_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[0], [x9] +; NO_SVE-NEXT: .LBB0_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB0_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[4], [x8] +; NO_SVE-NEXT: .LBB0_4: // %else2 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: strb w9, [x0] +; NO_SVE-NEXT: strb w8, [x0, #1] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrb w8, [x0] @@ -50,6 +83,56 @@ } define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: cmeq v0.4h, v0.4h, #0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB1_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB1_7 +; NO_SVE-NEXT: .LBB1_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB1_8 +; NO_SVE-NEXT: .LBB1_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB1_5 +; NO_SVE-NEXT: .LBB1_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[6], [x8] +; NO_SVE-NEXT: .LBB1_5: // %else8 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: str s0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 
+; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB1_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[0], [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB1_2 +; NO_SVE-NEXT: .LBB1_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB1_3 +; NO_SVE-NEXT: .LBB1_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB1_4 +; NO_SVE-NEXT: b .LBB1_5 +; ; CHECK-LABEL: masked_gather_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] @@ -76,6 +159,95 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; Ensure sensible type legalisation. +; NO_SVE-LABEL: masked_gather_v8i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.8b, v0.8b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB2_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB2_11 +; NO_SVE-NEXT: .LBB2_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB2_12 +; NO_SVE-NEXT: .LBB2_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB2_13 +; NO_SVE-NEXT: .LBB2_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB2_14 +; NO_SVE-NEXT: .LBB2_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB2_15 +; NO_SVE-NEXT: .LBB2_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB2_16 +; NO_SVE-NEXT: .LBB2_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB2_9 +; NO_SVE-NEXT: .LBB2_8: // %cond.load19 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB2_9: // %else20 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB2_10: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr b0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB2_2 +; NO_SVE-NEXT: .LBB2_11: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB2_3 +; NO_SVE-NEXT: .LBB2_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB2_4 +; NO_SVE-NEXT: .LBB2_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB2_5 +; NO_SVE-NEXT: .LBB2_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB2_6 +; NO_SVE-NEXT: .LBB2_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] 
+; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB2_7 +; NO_SVE-NEXT: .LBB2_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB2_8 +; NO_SVE-NEXT: b .LBB2_9 +; ; VBITS_EQ_256-LABEL: masked_gather_v8i8: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ldr d0, [x0] @@ -131,6 +303,230 @@ } define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[8] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[9] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[10] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[11] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[12] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[14] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB3_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB3_19 +; NO_SVE-NEXT: .LBB3_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB3_20 +; NO_SVE-NEXT: .LBB3_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB3_21 +; NO_SVE-NEXT: .LBB3_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB3_22 +; NO_SVE-NEXT: .LBB3_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB3_23 +; NO_SVE-NEXT: .LBB3_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB3_24 +; NO_SVE-NEXT: .LBB3_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB3_25 +; NO_SVE-NEXT: .LBB3_8: // %else20 +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB3_26 +; NO_SVE-NEXT: .LBB3_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB3_27 +; NO_SVE-NEXT: .LBB3_10: // %else26 +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB3_28 +; NO_SVE-NEXT: .LBB3_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB3_29 +; NO_SVE-NEXT: .LBB3_12: // %else32 +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB3_30 +; NO_SVE-NEXT: .LBB3_13: // %else35 
+; NO_SVE-NEXT: tbnz w8, #13, .LBB3_31 +; NO_SVE-NEXT: .LBB3_14: // %else38 +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB3_32 +; NO_SVE-NEXT: .LBB3_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB3_17 +; NO_SVE-NEXT: .LBB3_16: // %cond.load43 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB3_17: // %else44 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB3_18: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr b0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB3_2 +; NO_SVE-NEXT: .LBB3_19: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB3_3 +; NO_SVE-NEXT: .LBB3_20: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB3_4 +; NO_SVE-NEXT: .LBB3_21: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB3_5 +; NO_SVE-NEXT: .LBB3_22: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB3_6 +; NO_SVE-NEXT: .LBB3_23: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB3_7 +; NO_SVE-NEXT: .LBB3_24: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB3_8 +; NO_SVE-NEXT: .LBB3_25: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB3_9 +; NO_SVE-NEXT: .LBB3_26: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB3_10 +; NO_SVE-NEXT: .LBB3_27: // %cond.load25 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB3_11 +; NO_SVE-NEXT: .LBB3_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB3_12 +; NO_SVE-NEXT: .LBB3_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB3_13 +; NO_SVE-NEXT: .LBB3_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB3_14 +; NO_SVE-NEXT: .LBB3_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB3_15 +; NO_SVE-NEXT: .LBB3_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB3_16 +; NO_SVE-NEXT: b .LBB3_17 +; +; VBITS_EQ_256-LABEL: masked_gather_v16i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ldr q0, [x0] +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: zip2 v1.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: zip1 v2.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; 
VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8 +; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: zip2 v3.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: zip1 v0.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 +; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: shl v3.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: shl v0.4h, v0.4h, #8 +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: sshr v1.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: sshr v0.4h, v0.4h, #8 +; VBITS_EQ_256-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: ld1b { z2.d }, p1/z, [z6.d] +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1b { z3.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: ld1b { z0.d }, p1/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1b { z1.d }, p0/z, [z4.d] +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: uzp1 v0.16b, v3.16b, v1.16b +; VBITS_EQ_256-NEXT: str q0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ldr q0, [x0] @@ -156,6 +552,509 @@ } define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[8] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[9] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[10] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[11] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: umov w9, v1.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; 
NO_SVE-NEXT: umov w11, v1.b[4] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w15, w14, #1, #1 +; NO_SVE-NEXT: umov w14, v1.b[9] +; NO_SVE-NEXT: bfi w15, w9, #2, #1 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: bfi w15, w9, #4, #1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[13] +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[8] +; NO_SVE-NEXT: umov w9, v0.b[14] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w11, w15, w11, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w10, w11, w10, lsl #7 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[10] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #13 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[11] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #14 +; NO_SVE-NEXT: orr w9, w10, w11, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[12] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[13] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB4_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr b0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB4_3 +; NO_SVE-NEXT: b .LBB4_4 +; NO_SVE-NEXT: .LBB4_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB4_4 +; NO_SVE-NEXT: .LBB4_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB4_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB4_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB4_21 +; NO_SVE-NEXT: .LBB4_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB4_22 +; NO_SVE-NEXT: .LBB4_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB4_23 +; NO_SVE-NEXT: .LBB4_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB4_24 +; NO_SVE-NEXT: .LBB4_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB4_25 +; NO_SVE-NEXT: .LBB4_10: // %else20 +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB4_26 +; NO_SVE-NEXT: .LBB4_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB4_27 +; NO_SVE-NEXT: .LBB4_12: // %else26 +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB4_28 +; NO_SVE-NEXT: .LBB4_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB4_29 +; NO_SVE-NEXT: .LBB4_14: // %else32 +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB4_30 +; NO_SVE-NEXT: .LBB4_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB4_31 +; 
NO_SVE-NEXT: .LBB4_16: // %else38 +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB4_32 +; NO_SVE-NEXT: .LBB4_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB4_33 +; NO_SVE-NEXT: .LBB4_18: // %else44 +; NO_SVE-NEXT: ldr q2, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB4_34 +; NO_SVE-NEXT: .LBB4_19: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB4_35 +; NO_SVE-NEXT: b .LBB4_36 +; NO_SVE-NEXT: .LBB4_20: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB4_6 +; NO_SVE-NEXT: .LBB4_21: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB4_7 +; NO_SVE-NEXT: .LBB4_22: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB4_8 +; NO_SVE-NEXT: .LBB4_23: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB4_9 +; NO_SVE-NEXT: .LBB4_24: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB4_10 +; NO_SVE-NEXT: .LBB4_25: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB4_11 +; NO_SVE-NEXT: .LBB4_26: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB4_12 +; NO_SVE-NEXT: .LBB4_27: // %cond.load25 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB4_13 +; NO_SVE-NEXT: .LBB4_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB4_14 +; NO_SVE-NEXT: .LBB4_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB4_15 +; NO_SVE-NEXT: .LBB4_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB4_16 +; NO_SVE-NEXT: .LBB4_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB4_17 +; NO_SVE-NEXT: .LBB4_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB4_18 +; NO_SVE-NEXT: .LBB4_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB4_19 +; NO_SVE-NEXT: .LBB4_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #17, .LBB4_36 +; NO_SVE-NEXT: .LBB4_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB4_36: // %else50 +; NO_SVE-NEXT: ldr q2, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB4_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB4_53 +; NO_SVE-NEXT: .LBB4_38: // %else56 +; NO_SVE-NEXT: ldr q2, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB4_54 +; NO_SVE-NEXT: .LBB4_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB4_55 +; NO_SVE-NEXT: .LBB4_40: // %else62 +; NO_SVE-NEXT: ldr q2, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB4_56 +; NO_SVE-NEXT: .LBB4_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB4_57 +; 
NO_SVE-NEXT: .LBB4_42: // %else68 +; NO_SVE-NEXT: ldr q2, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB4_58 +; NO_SVE-NEXT: .LBB4_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB4_59 +; NO_SVE-NEXT: .LBB4_44: // %else74 +; NO_SVE-NEXT: ldr q2, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB4_60 +; NO_SVE-NEXT: .LBB4_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB4_61 +; NO_SVE-NEXT: .LBB4_46: // %else80 +; NO_SVE-NEXT: ldr q2, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB4_62 +; NO_SVE-NEXT: .LBB4_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB4_63 +; NO_SVE-NEXT: .LBB4_48: // %else86 +; NO_SVE-NEXT: ldr q2, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB4_64 +; NO_SVE-NEXT: .LBB4_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB4_51 +; NO_SVE-NEXT: .LBB4_50: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[15], [x8] +; NO_SVE-NEXT: .LBB4_51: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB4_52: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB4_38 +; NO_SVE-NEXT: .LBB4_53: // %cond.load55 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB4_39 +; NO_SVE-NEXT: .LBB4_54: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB4_40 +; NO_SVE-NEXT: .LBB4_55: // %cond.load61 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB4_41 +; NO_SVE-NEXT: .LBB4_56: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB4_42 +; NO_SVE-NEXT: .LBB4_57: // %cond.load67 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB4_43 +; NO_SVE-NEXT: .LBB4_58: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB4_44 +; NO_SVE-NEXT: .LBB4_59: // %cond.load73 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB4_45 +; NO_SVE-NEXT: .LBB4_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB4_46 +; NO_SVE-NEXT: .LBB4_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB4_47 +; NO_SVE-NEXT: .LBB4_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB4_48 +; NO_SVE-NEXT: .LBB4_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB4_49 +; NO_SVE-NEXT: .LBB4_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB4_50 +; NO_SVE-NEXT: b .LBB4_51 +; +; VBITS_EQ_256-LABEL: masked_gather_v32i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; VBITS_EQ_256-NEXT: .cfi_def_cfa_offset 16 +; VBITS_EQ_256-NEXT: mov x29, sp +; VBITS_EQ_256-NEXT: .cfi_def_cfa w29, 16 +; VBITS_EQ_256-NEXT: .cfi_offset w30, -8 +; VBITS_EQ_256-NEXT: .cfi_offset w29, -16 +; VBITS_EQ_256-NEXT: sub x9, sp, #48 +; VBITS_EQ_256-NEXT: and sp, x9, #0xffffffffffffffe0 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; VBITS_EQ_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: zip2 v2.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: shl v3.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: sshr v5.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: mov x8, #20 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1b { z5.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: zip1 v7.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_EQ_256-NEXT: umov w8, v5.h[3] +; VBITS_EQ_256-NEXT: umov w9, v5.h[2] +; VBITS_EQ_256-NEXT: umov w10, v5.h[1] +; VBITS_EQ_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: umov w11, v5.h[0] +; VBITS_EQ_256-NEXT: mov z5.d, z4.d +; VBITS_EQ_256-NEXT: sunpklo z7.s, z7.h +; VBITS_EQ_256-NEXT: ext z5.b, z5.b, z4.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z7.d, z7.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #7] +; VBITS_EQ_256-NEXT: strb w9, [sp, #6] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_EQ_256-NEXT: strb w10, [sp, #5] +; VBITS_EQ_256-NEXT: strb w11, [sp, #4] +; VBITS_EQ_256-NEXT: ld1b { z7.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: zip2 v17.8b, v5.8b, v0.8b +; VBITS_EQ_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_EQ_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: shl v17.4h, v17.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_EQ_256-NEXT: umov w8, v7.h[3] +; VBITS_EQ_256-NEXT: umov w9, v7.h[2] +; VBITS_EQ_256-NEXT: umov w10, v7.h[1] +; VBITS_EQ_256-NEXT: sshr v17.4h, v17.4h, #8 +; VBITS_EQ_256-NEXT: umov w11, v7.h[0] +; VBITS_EQ_256-NEXT: sunpklo z7.s, z17.h +; VBITS_EQ_256-NEXT: sunpklo z7.d, z7.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #3] +; VBITS_EQ_256-NEXT: strb w9, [sp, #2] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_EQ_256-NEXT: strb w10, [sp, #1] +; VBITS_EQ_256-NEXT: strb w11, [sp] +; VBITS_EQ_256-NEXT: ld1b { z7.d }, p2/z, [z16.d] +; VBITS_EQ_256-NEXT: zip1 v16.8b, v5.8b, v0.8b +; VBITS_EQ_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: shl v16.4h, v16.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_EQ_256-NEXT: umov w8, v7.h[3] +; VBITS_EQ_256-NEXT: umov w9, v7.h[2] +; VBITS_EQ_256-NEXT: umov w10, v7.h[1] +; VBITS_EQ_256-NEXT: sshr v16.4h, v16.4h, #8 +; VBITS_EQ_256-NEXT: umov w11, 
v7.h[0] +; VBITS_EQ_256-NEXT: sunpklo z7.s, z16.h +; VBITS_EQ_256-NEXT: sunpklo z7.d, z7.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #23] +; VBITS_EQ_256-NEXT: strb w9, [sp, #22] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_EQ_256-NEXT: strb w10, [sp, #21] +; VBITS_EQ_256-NEXT: zip2 v7.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: strb w11, [sp, #20] +; VBITS_EQ_256-NEXT: zip1 v4.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: ld1b { z6.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_EQ_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: umov w8, v6.h[3] +; VBITS_EQ_256-NEXT: umov w9, v6.h[2] +; VBITS_EQ_256-NEXT: umov w10, v6.h[1] +; VBITS_EQ_256-NEXT: umov w11, v6.h[0] +; VBITS_EQ_256-NEXT: sunpklo z6.s, z7.h +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #19] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: strb w9, [sp, #18] +; VBITS_EQ_256-NEXT: strb w10, [sp, #17] +; VBITS_EQ_256-NEXT: strb w11, [sp, #16] +; VBITS_EQ_256-NEXT: ld1b { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: umov w8, v3.h[3] +; VBITS_EQ_256-NEXT: umov w9, v3.h[2] +; VBITS_EQ_256-NEXT: umov w10, v3.h[1] +; VBITS_EQ_256-NEXT: umov w11, v3.h[0] +; VBITS_EQ_256-NEXT: ext v3.16b, v5.16b, v5.16b, #8 +; VBITS_EQ_256-NEXT: strb w8, [sp, #15] +; VBITS_EQ_256-NEXT: strb w9, [sp, #14] +; VBITS_EQ_256-NEXT: strb w10, [sp, #13] +; VBITS_EQ_256-NEXT: zip2 v4.8b, v3.8b, v0.8b +; VBITS_EQ_256-NEXT: strb w11, [sp, #12] +; VBITS_EQ_256-NEXT: ld1b { z2.d }, p2/z, [z2.d] +; VBITS_EQ_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: umov w8, v2.h[3] +; VBITS_EQ_256-NEXT: umov w9, v2.h[2] +; VBITS_EQ_256-NEXT: umov w10, v2.h[1] +; VBITS_EQ_256-NEXT: umov w11, v2.h[0] +; VBITS_EQ_256-NEXT: sunpklo z2.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #11] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: strb w9, [sp, #10] +; VBITS_EQ_256-NEXT: zip1 v2.8b, v3.8b, v0.8b +; VBITS_EQ_256-NEXT: strb w10, [sp, #9] +; VBITS_EQ_256-NEXT: strb w11, [sp, #8] +; VBITS_EQ_256-NEXT: ld1b { z1.d }, p2/z, [z1.d] +; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: umov w8, v1.h[3] +; VBITS_EQ_256-NEXT: umov w9, v1.h[2] +; VBITS_EQ_256-NEXT: umov w10, v1.h[1] +; VBITS_EQ_256-NEXT: umov w11, v1.h[0] +; VBITS_EQ_256-NEXT: sunpklo z1.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #31] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: strb w9, [sp, #30] +; VBITS_EQ_256-NEXT: strb w10, [sp, #29] +; VBITS_EQ_256-NEXT: strb w11, [sp, #28] +; VBITS_EQ_256-NEXT: ld1b { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: umov w8, v0.h[3] +; VBITS_EQ_256-NEXT: umov w9, v0.h[2] +; VBITS_EQ_256-NEXT: umov w10, v0.h[1] +; VBITS_EQ_256-NEXT: umov w11, 
v0.h[0] +; VBITS_EQ_256-NEXT: strb w8, [sp, #27] +; VBITS_EQ_256-NEXT: strb w9, [sp, #26] +; VBITS_EQ_256-NEXT: strb w10, [sp, #25] +; VBITS_EQ_256-NEXT: strb w11, [sp, #24] +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [sp] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_EQ_256-NEXT: mov sp, x29 +; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 @@ -185,6 +1084,38 @@ ; define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldrh w8, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fmov s0, w8 +; NO_SVE-NEXT: ldrh w8, [x0, #2] +; NO_SVE-NEXT: mov v0.s[1], w8 +; NO_SVE-NEXT: cmeq v0.2s, v0.2s, #0 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB5_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[0], [x9] +; NO_SVE-NEXT: .LBB5_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB5_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[2], [x8] +; NO_SVE-NEXT: .LBB5_4: // %else2 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: strh w9, [x0] +; NO_SVE-NEXT: strh w8, [x0, #2] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w8, [x0] @@ -210,6 +1141,54 @@ } define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.4h, v0.4h, #0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB6_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB6_7 +; NO_SVE-NEXT: .LBB6_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB6_8 +; NO_SVE-NEXT: .LBB6_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB6_5 +; NO_SVE-NEXT: .LBB6_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x8] +; NO_SVE-NEXT: .LBB6_5: // %else8 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB6_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB6_2 +; NO_SVE-NEXT: .LBB6_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB6_3 +; NO_SVE-NEXT: .LBB6_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB6_4 +; NO_SVE-NEXT: b .LBB6_5 +; ; CHECK-LABEL: masked_gather_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -234,6 +1213,96 @@ define void @masked_gather_v8i16(<8 x 
i16>* %a, <8 x i16*>* %b) #0 { ; Ensure sensible type legalisation. +; NO_SVE-LABEL: masked_gather_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB7_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB7_11 +; NO_SVE-NEXT: .LBB7_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB7_12 +; NO_SVE-NEXT: .LBB7_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB7_13 +; NO_SVE-NEXT: .LBB7_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB7_14 +; NO_SVE-NEXT: .LBB7_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB7_15 +; NO_SVE-NEXT: .LBB7_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB7_16 +; NO_SVE-NEXT: .LBB7_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB7_9 +; NO_SVE-NEXT: .LBB7_8: // %cond.load19 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB7_9: // %else20 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB7_10: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB7_2 +; NO_SVE-NEXT: .LBB7_11: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB7_3 +; NO_SVE-NEXT: .LBB7_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB7_4 +; NO_SVE-NEXT: .LBB7_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB7_5 +; NO_SVE-NEXT: .LBB7_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB7_6 +; NO_SVE-NEXT: .LBB7_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB7_7 +; NO_SVE-NEXT: .LBB7_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB7_8 +; NO_SVE-NEXT: b .LBB7_9 +; ; VBITS_EQ_256-LABEL: masked_gather_v8i16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ldr q0, [x0] @@ -282,6 +1351,225 @@ } define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; 
NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB8_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB8_3 +; NO_SVE-NEXT: b .LBB8_4 +; NO_SVE-NEXT: .LBB8_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB8_4 +; NO_SVE-NEXT: .LBB8_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB8_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB8_13 +; NO_SVE-NEXT: .LBB8_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB8_14 +; NO_SVE-NEXT: .LBB8_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB8_15 +; NO_SVE-NEXT: .LBB8_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB8_16 +; NO_SVE-NEXT: .LBB8_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB8_17 +; NO_SVE-NEXT: .LBB8_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB8_18 +; NO_SVE-NEXT: .LBB8_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB8_19 +; NO_SVE-NEXT: b .LBB8_20 +; NO_SVE-NEXT: .LBB8_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB8_6 +; NO_SVE-NEXT: .LBB8_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB8_7 +; NO_SVE-NEXT: .LBB8_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB8_8 +; NO_SVE-NEXT: 
.LBB8_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB8_9 +; NO_SVE-NEXT: .LBB8_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB8_10 +; NO_SVE-NEXT: .LBB8_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB8_11 +; NO_SVE-NEXT: .LBB8_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB8_20 +; NO_SVE-NEXT: .LBB8_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB8_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB8_29 +; NO_SVE-NEXT: .LBB8_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB8_30 +; NO_SVE-NEXT: .LBB8_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB8_31 +; NO_SVE-NEXT: .LBB8_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB8_32 +; NO_SVE-NEXT: .LBB8_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB8_27 +; NO_SVE-NEXT: .LBB8_26: // %cond.load43 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x8] +; NO_SVE-NEXT: .LBB8_27: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB8_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB8_22 +; NO_SVE-NEXT: .LBB8_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB8_23 +; NO_SVE-NEXT: .LBB8_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB8_24 +; NO_SVE-NEXT: .LBB8_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB8_25 +; NO_SVE-NEXT: .LBB8_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB8_26 +; NO_SVE-NEXT: b .LBB8_27 +; +; VBITS_EQ_256-LABEL: masked_gather_v16i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: cmpeq p2.h, p0/z, z0.h, #0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z4.s, z2.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: 
sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z4.s +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -305,6 +1593,431 @@ } define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v2.b[1] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w15, v2.b[2] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[3] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w9, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v2.b[4] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w15, w14, #1, #1 +; NO_SVE-NEXT: bfi w15, w9, #2, #1 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w11, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w9, #3, #1 +; 
NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w15, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: umov w14, v0.b[1] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #13 +; NO_SVE-NEXT: orr w11, w15, w11, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB9_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB9_3 +; NO_SVE-NEXT: b .LBB9_4 +; NO_SVE-NEXT: .LBB9_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB9_4 +; NO_SVE-NEXT: .LBB9_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB9_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB9_13 +; NO_SVE-NEXT: .LBB9_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB9_14 +; NO_SVE-NEXT: .LBB9_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB9_15 +; NO_SVE-NEXT: .LBB9_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB9_16 +; NO_SVE-NEXT: .LBB9_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB9_17 +; NO_SVE-NEXT: .LBB9_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB9_18 +; NO_SVE-NEXT: .LBB9_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB9_19 +; NO_SVE-NEXT: b .LBB9_20 +; NO_SVE-NEXT: .LBB9_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB9_6 +; NO_SVE-NEXT: .LBB9_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB9_7 +; NO_SVE-NEXT: .LBB9_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB9_8 +; NO_SVE-NEXT: .LBB9_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB9_9 +; NO_SVE-NEXT: .LBB9_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB9_10 +; NO_SVE-NEXT: .LBB9_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], 
[x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB9_11 +; NO_SVE-NEXT: .LBB9_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB9_20 +; NO_SVE-NEXT: .LBB9_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB9_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB9_29 +; NO_SVE-NEXT: .LBB9_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB9_30 +; NO_SVE-NEXT: .LBB9_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB9_31 +; NO_SVE-NEXT: .LBB9_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB9_32 +; NO_SVE-NEXT: .LBB9_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB9_33 +; NO_SVE-NEXT: .LBB9_26: // %else44 +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB9_34 +; NO_SVE-NEXT: .LBB9_27: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB9_35 +; NO_SVE-NEXT: b .LBB9_36 +; NO_SVE-NEXT: .LBB9_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB9_22 +; NO_SVE-NEXT: .LBB9_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB9_23 +; NO_SVE-NEXT: .LBB9_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB9_24 +; NO_SVE-NEXT: .LBB9_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB9_25 +; NO_SVE-NEXT: .LBB9_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB9_26 +; NO_SVE-NEXT: .LBB9_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB9_27 +; NO_SVE-NEXT: .LBB9_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB9_36 +; NO_SVE-NEXT: .LBB9_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_36: // %else50 +; NO_SVE-NEXT: ldr q3, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB9_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB9_45 +; NO_SVE-NEXT: .LBB9_38: // %else56 +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB9_46 +; NO_SVE-NEXT: .LBB9_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB9_47 +; NO_SVE-NEXT: .LBB9_40: // %else62 +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB9_48 +; NO_SVE-NEXT: .LBB9_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB9_49 +; NO_SVE-NEXT: .LBB9_42: // %else68 +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB9_50 +; NO_SVE-NEXT: .LBB9_43: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB9_51 +; NO_SVE-NEXT: b .LBB9_52 +; NO_SVE-NEXT: .LBB9_44: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB9_38 +; NO_SVE-NEXT: .LBB9_45: // %cond.load55 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB9_39 +; NO_SVE-NEXT: .LBB9_46: // 
%cond.load58 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB9_40 +; NO_SVE-NEXT: .LBB9_47: // %cond.load61 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB9_41 +; NO_SVE-NEXT: .LBB9_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB9_42 +; NO_SVE-NEXT: .LBB9_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB9_43 +; NO_SVE-NEXT: .LBB9_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #25, .LBB9_52 +; NO_SVE-NEXT: .LBB9_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_52: // %else74 +; NO_SVE-NEXT: ldr q4, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB9_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB9_61 +; NO_SVE-NEXT: .LBB9_54: // %else80 +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB9_62 +; NO_SVE-NEXT: .LBB9_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB9_63 +; NO_SVE-NEXT: .LBB9_56: // %else86 +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB9_64 +; NO_SVE-NEXT: .LBB9_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB9_59 +; NO_SVE-NEXT: .LBB9_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x8] +; NO_SVE-NEXT: .LBB9_59: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB9_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB9_54 +; NO_SVE-NEXT: .LBB9_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB9_55 +; NO_SVE-NEXT: .LBB9_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB9_56 +; NO_SVE-NEXT: .LBB9_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB9_57 +; NO_SVE-NEXT: .LBB9_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB9_58 +; NO_SVE-NEXT: b .LBB9_59 +; +; VBITS_EQ_256-LABEL: masked_gather_v32i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: cmpeq p2.h, p0/z, z3.h, #0 +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z3.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; 
VBITS_EQ_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z18.h +; VBITS_EQ_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: cmpeq p2.h, p0/z, z4.h, #0 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_EQ_256-NEXT: sunpklo z6.s, z16.h +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: sunpklo z5.s, z16.h +; VBITS_EQ_256-NEXT: sunpklo z6.s, z17.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -332,6 +2045,32 @@ ; define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.2s, v0.2s, #0 +; 
NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB10_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: .LBB10_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB10_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x8] +; NO_SVE-NEXT: .LBB10_4: // %else2 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -353,6 +2092,55 @@ } define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB11_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB11_7 +; NO_SVE-NEXT: .LBB11_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB11_8 +; NO_SVE-NEXT: .LBB11_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB11_5 +; NO_SVE-NEXT: .LBB11_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x8] +; NO_SVE-NEXT: .LBB11_5: // %else8 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB11_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB11_2 +; NO_SVE-NEXT: .LBB11_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB11_3 +; NO_SVE-NEXT: .LBB11_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB11_4 +; NO_SVE-NEXT: b .LBB11_5 +; ; CHECK-LABEL: masked_gather_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -375,6 +2163,97 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: masked_gather_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB12_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB12_3 +; NO_SVE-NEXT: b .LBB12_4 +; NO_SVE-NEXT: .LBB12_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB12_4 +; NO_SVE-NEXT: .LBB12_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB12_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB12_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB12_9 +; NO_SVE-NEXT: .LBB12_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB12_10 +; NO_SVE-NEXT: .LBB12_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB12_11 +; NO_SVE-NEXT: b .LBB12_12 +; NO_SVE-NEXT: .LBB12_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB12_6 +; NO_SVE-NEXT: .LBB12_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB12_7 +; NO_SVE-NEXT: .LBB12_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB12_12 +; NO_SVE-NEXT: .LBB12_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB12_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB12_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB12_15 +; NO_SVE-NEXT: .LBB12_14: // %cond.load19 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x8] +; NO_SVE-NEXT: .LBB12_15: // %else20 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB12_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB12_14 +; NO_SVE-NEXT: b .LBB12_15 +; ; VBITS_EQ_256-LABEL: masked_gather_v8i32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 @@ -420,6 +2299,225 @@ } define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq 
v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x0, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB13_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB13_3 +; NO_SVE-NEXT: b .LBB13_4 +; NO_SVE-NEXT: .LBB13_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB13_4 +; NO_SVE-NEXT: .LBB13_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB13_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB13_9 +; NO_SVE-NEXT: .LBB13_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB13_10 +; NO_SVE-NEXT: .LBB13_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB13_11 +; NO_SVE-NEXT: b .LBB13_12 +; NO_SVE-NEXT: .LBB13_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB13_6 +; NO_SVE-NEXT: .LBB13_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB13_7 +; NO_SVE-NEXT: .LBB13_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB13_12 +; NO_SVE-NEXT: .LBB13_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB13_16 +; 
NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB13_17 +; NO_SVE-NEXT: .LBB13_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB13_18 +; NO_SVE-NEXT: .LBB13_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB13_19 +; NO_SVE-NEXT: b .LBB13_20 +; NO_SVE-NEXT: .LBB13_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB13_14 +; NO_SVE-NEXT: .LBB13_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB13_15 +; NO_SVE-NEXT: .LBB13_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB13_20 +; NO_SVE-NEXT: .LBB13_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB13_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB13_25 +; NO_SVE-NEXT: .LBB13_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB13_26 +; NO_SVE-NEXT: .LBB13_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB13_27 +; NO_SVE-NEXT: b .LBB13_28 +; NO_SVE-NEXT: .LBB13_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB13_22 +; NO_SVE-NEXT: .LBB13_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB13_23 +; NO_SVE-NEXT: .LBB13_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB13_28 +; NO_SVE-NEXT: .LBB13_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB13_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB13_31 +; NO_SVE-NEXT: .LBB13_30: // %cond.load43 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: .LBB13_31: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB13_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB13_30 +; NO_SVE-NEXT: b .LBB13_31 +; +; VBITS_EQ_256-LABEL: masked_gather_v16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_EQ_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: 
ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -441,6 +2539,431 @@ } define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: cmeq v3.4s, v3.4s, #0 +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: cmeq v4.4s, v4.4s, #0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: cmeq v5.4s, v5.4s, #0 +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v3.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: umov w10, v3.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: ldp q1, q4, [x0, #32] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: cmeq 
v2.4s, v4.4s, #0 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: bfi w9, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v3.b[6] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #6 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #7 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB14_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB14_3 +; NO_SVE-NEXT: b .LBB14_4 +; NO_SVE-NEXT: .LBB14_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB14_4 +; NO_SVE-NEXT: .LBB14_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB14_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB14_9 +; NO_SVE-NEXT: .LBB14_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB14_10 +; NO_SVE-NEXT: .LBB14_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB14_11 +; NO_SVE-NEXT: b .LBB14_12 +; NO_SVE-NEXT: .LBB14_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB14_6 +; NO_SVE-NEXT: .LBB14_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB14_7 +; NO_SVE-NEXT: .LBB14_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB14_12 +; NO_SVE-NEXT: .LBB14_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB14_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB14_17 +; NO_SVE-NEXT: .LBB14_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB14_18 
+; NO_SVE-NEXT: .LBB14_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB14_19 +; NO_SVE-NEXT: b .LBB14_20 +; NO_SVE-NEXT: .LBB14_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB14_14 +; NO_SVE-NEXT: .LBB14_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB14_15 +; NO_SVE-NEXT: .LBB14_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB14_20 +; NO_SVE-NEXT: .LBB14_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB14_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB14_25 +; NO_SVE-NEXT: .LBB14_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB14_26 +; NO_SVE-NEXT: .LBB14_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB14_27 +; NO_SVE-NEXT: b .LBB14_28 +; NO_SVE-NEXT: .LBB14_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB14_22 +; NO_SVE-NEXT: .LBB14_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB14_23 +; NO_SVE-NEXT: .LBB14_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB14_28 +; NO_SVE-NEXT: .LBB14_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB14_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB14_33 +; NO_SVE-NEXT: .LBB14_30: // %else44 +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB14_34 +; NO_SVE-NEXT: .LBB14_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB14_35 +; NO_SVE-NEXT: b .LBB14_36 +; NO_SVE-NEXT: .LBB14_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB14_30 +; NO_SVE-NEXT: .LBB14_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB14_31 +; NO_SVE-NEXT: .LBB14_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #17, .LBB14_36 +; NO_SVE-NEXT: .LBB14_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_36: // %else50 +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB14_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB14_41 +; NO_SVE-NEXT: .LBB14_38: // %else56 +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB14_42 +; NO_SVE-NEXT: .LBB14_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB14_43 +; NO_SVE-NEXT: b .LBB14_44 +; NO_SVE-NEXT: .LBB14_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB14_38 +; NO_SVE-NEXT: .LBB14_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: ldr q6, [x1, 
#160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB14_39 +; NO_SVE-NEXT: .LBB14_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #21, .LBB14_44 +; NO_SVE-NEXT: .LBB14_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_44: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB14_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB14_49 +; NO_SVE-NEXT: .LBB14_46: // %else68 +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB14_50 +; NO_SVE-NEXT: .LBB14_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB14_51 +; NO_SVE-NEXT: b .LBB14_52 +; NO_SVE-NEXT: .LBB14_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB14_46 +; NO_SVE-NEXT: .LBB14_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB14_47 +; NO_SVE-NEXT: .LBB14_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #25, .LBB14_52 +; NO_SVE-NEXT: .LBB14_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_52: // %else74 +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB14_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB14_57 +; NO_SVE-NEXT: .LBB14_54: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB14_58 +; NO_SVE-NEXT: .LBB14_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB14_59 +; NO_SVE-NEXT: b .LBB14_60 +; NO_SVE-NEXT: .LBB14_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB14_54 +; NO_SVE-NEXT: .LBB14_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB14_55 +; NO_SVE-NEXT: .LBB14_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #29, .LBB14_60 +; NO_SVE-NEXT: .LBB14_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_60: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB14_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB14_63 +; NO_SVE-NEXT: .LBB14_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB14_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB14_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB14_62 +; NO_SVE-NEXT: b .LBB14_63 +; +; VBITS_EQ_256-LABEL: masked_gather_v32i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] 
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpeq p4.s, p0/z, z2.s, #0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_EQ_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z3.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -467,6 +2990,20 @@ ; Scalarize 1 x i64 gathers define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v1i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; 
NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: cbnz x8, .LBB15_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: ldr d0, [x8] +; NO_SVE-NEXT: .LBB15_2: // %else +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -489,6 +3026,33 @@ } define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: xtn v0.2s, v0.2d +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB16_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: .LBB16_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB16_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x8] +; NO_SVE-NEXT: .LBB16_4: // %else2 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -508,6 +3072,57 @@ } define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbz w10, #0, .LBB17_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB17_3 +; NO_SVE-NEXT: b .LBB17_4 +; NO_SVE-NEXT: .LBB17_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB17_4 +; NO_SVE-NEXT: .LBB17_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB17_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB17_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB17_7 +; NO_SVE-NEXT: b .LBB17_8 +; NO_SVE-NEXT: .LBB17_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB17_8 +; NO_SVE-NEXT: .LBB17_7: // %cond.load7 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x8] +; NO_SVE-NEXT: .LBB17_8: // %else8 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -527,6 +3142,104 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: masked_gather_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB18_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB18_3 +; NO_SVE-NEXT: b .LBB18_4 +; NO_SVE-NEXT: .LBB18_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB18_4 +; NO_SVE-NEXT: .LBB18_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB18_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB18_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB18_7 +; NO_SVE-NEXT: b .LBB18_8 +; NO_SVE-NEXT: .LBB18_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB18_8 +; NO_SVE-NEXT: .LBB18_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB18_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB18_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB18_11 +; NO_SVE-NEXT: b .LBB18_12 +; NO_SVE-NEXT: .LBB18_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB18_12 +; NO_SVE-NEXT: .LBB18_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB18_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB18_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB18_15 +; NO_SVE-NEXT: b .LBB18_16 +; NO_SVE-NEXT: .LBB18_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB18_16 +; NO_SVE-NEXT: .LBB18_15: // %cond.load19 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: .LBB18_16: // %else20 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: masked_gather_v8i64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -562,6 +3275,224 @@ } define void 
@masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, #0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: uzp1 v3.4s, v4.4s, v5.4s +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: cmeq v3.2d, v5.2d, #0 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: uzp1 v2.4s, v4.4s, v3.4s +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[3] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB19_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB19_3 +; NO_SVE-NEXT: b .LBB19_4 +; NO_SVE-NEXT: .LBB19_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB19_4 +; NO_SVE-NEXT: .LBB19_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB19_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB19_7 +; NO_SVE-NEXT: b .LBB19_8 +; NO_SVE-NEXT: .LBB19_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB19_8 +; NO_SVE-NEXT: .LBB19_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], 
[x9] +; NO_SVE-NEXT: .LBB19_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB19_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB19_11 +; NO_SVE-NEXT: b .LBB19_12 +; NO_SVE-NEXT: .LBB19_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB19_12 +; NO_SVE-NEXT: .LBB19_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB19_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB19_15 +; NO_SVE-NEXT: b .LBB19_16 +; NO_SVE-NEXT: .LBB19_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB19_16 +; NO_SVE-NEXT: .LBB19_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB19_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB19_19 +; NO_SVE-NEXT: b .LBB19_20 +; NO_SVE-NEXT: .LBB19_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB19_20 +; NO_SVE-NEXT: .LBB19_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB19_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB19_23 +; NO_SVE-NEXT: b .LBB19_24 +; NO_SVE-NEXT: .LBB19_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB19_24 +; NO_SVE-NEXT: .LBB19_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB19_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB19_27 +; NO_SVE-NEXT: b .LBB19_28 +; NO_SVE-NEXT: .LBB19_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB19_28 +; NO_SVE-NEXT: .LBB19_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB19_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB19_31 +; NO_SVE-NEXT: b .LBB19_32 +; NO_SVE-NEXT: .LBB19_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB19_32 +; NO_SVE-NEXT: .LBB19_31: // %cond.load43 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x8] +; NO_SVE-NEXT: .LBB19_32: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: 
ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p3.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p4.d, p0/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -580,6 +3511,430 @@ } define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #160] +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: ldp q4, q5, [x0, #128] +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, #0 +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q6, q7, [x0, #192] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v2.8h, v4.8h, v2.8h +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v6.2d, v6.2d, #0 +; NO_SVE-NEXT: cmeq v7.2d, v7.2d, #0 +; NO_SVE-NEXT: ldp q3, q5, [x0] +; NO_SVE-NEXT: uzp1 v1.4s, v6.4s, v7.4s +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: ldp q16, q17, [x0, #32] +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: cmeq v6.2d, v16.2d, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v4.2d, v17.2d, #0 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: cmeq v2.2d, v5.2d, #0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v4.4s, v6.4s, v4.4s +; NO_SVE-NEXT: umov w8, v0.b[0] +; NO_SVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h +; NO_SVE-NEXT: orr w9, w9, w12, lsl #7 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: umov 
w10, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: orr w8, w8, w9, lsl #9 +; NO_SVE-NEXT: umov w9, v0.b[5] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[1] +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: umov w13, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #11 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, #0 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: umov w14, v1.b[4] +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v4.8h +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[0] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB20_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB20_3 +; NO_SVE-NEXT: b .LBB20_4 +; NO_SVE-NEXT: .LBB20_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB20_4 +; NO_SVE-NEXT: .LBB20_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB20_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB20_7 +; NO_SVE-NEXT: b .LBB20_8 +; NO_SVE-NEXT: .LBB20_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB20_8 +; NO_SVE-NEXT: .LBB20_7: // %cond.load7 +; 
NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB20_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB20_11 +; NO_SVE-NEXT: b .LBB20_12 +; NO_SVE-NEXT: .LBB20_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB20_12 +; NO_SVE-NEXT: .LBB20_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB20_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB20_15 +; NO_SVE-NEXT: b .LBB20_16 +; NO_SVE-NEXT: .LBB20_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB20_16 +; NO_SVE-NEXT: .LBB20_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB20_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB20_19 +; NO_SVE-NEXT: b .LBB20_20 +; NO_SVE-NEXT: .LBB20_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB20_20 +; NO_SVE-NEXT: .LBB20_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB20_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB20_23 +; NO_SVE-NEXT: b .LBB20_24 +; NO_SVE-NEXT: .LBB20_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB20_24 +; NO_SVE-NEXT: .LBB20_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB20_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB20_27 +; NO_SVE-NEXT: b .LBB20_28 +; NO_SVE-NEXT: .LBB20_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB20_28 +; NO_SVE-NEXT: .LBB20_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB20_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB20_31 +; NO_SVE-NEXT: b .LBB20_32 +; NO_SVE-NEXT: .LBB20_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB20_32 +; NO_SVE-NEXT: .LBB20_31: // %cond.load43 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_32: // %else44 +; NO_SVE-NEXT: ldr q17, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB20_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB20_35 +; NO_SVE-NEXT: b .LBB20_36 +; NO_SVE-NEXT: .LBB20_34: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: tbz w8, #17, .LBB20_36 +; NO_SVE-NEXT: .LBB20_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v17.d[1] 
+; NO_SVE-NEXT: ld1 { v16.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_36: // %else50 +; NO_SVE-NEXT: ldr q18, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB20_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v17.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #19, .LBB20_39 +; NO_SVE-NEXT: b .LBB20_40 +; NO_SVE-NEXT: .LBB20_38: +; NO_SVE-NEXT: // implicit-def: $q17 +; NO_SVE-NEXT: tbz w8, #19, .LBB20_40 +; NO_SVE-NEXT: .LBB20_39: // %cond.load55 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v17.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_40: // %else56 +; NO_SVE-NEXT: ldr q19, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB20_42 +; NO_SVE-NEXT: // %bb.41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v18.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB20_43 +; NO_SVE-NEXT: b .LBB20_44 +; NO_SVE-NEXT: .LBB20_42: +; NO_SVE-NEXT: // implicit-def: $q18 +; NO_SVE-NEXT: tbz w8, #21, .LBB20_44 +; NO_SVE-NEXT: .LBB20_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v18.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_44: // %else62 +; NO_SVE-NEXT: ldr q20, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB20_46 +; NO_SVE-NEXT: // %bb.45: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d20 +; NO_SVE-NEXT: ld1 { v19.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #23, .LBB20_47 +; NO_SVE-NEXT: b .LBB20_48 +; NO_SVE-NEXT: .LBB20_46: +; NO_SVE-NEXT: // implicit-def: $q19 +; NO_SVE-NEXT: tbz w8, #23, .LBB20_48 +; NO_SVE-NEXT: .LBB20_47: // %cond.load67 +; NO_SVE-NEXT: mov x9, v20.d[1] +; NO_SVE-NEXT: ld1 { v19.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_48: // %else68 +; NO_SVE-NEXT: ldr q21, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB20_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v20.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB20_51 +; NO_SVE-NEXT: b .LBB20_52 +; NO_SVE-NEXT: .LBB20_50: +; NO_SVE-NEXT: // implicit-def: $q20 +; NO_SVE-NEXT: tbz w8, #25, .LBB20_52 +; NO_SVE-NEXT: .LBB20_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v20.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_52: // %else74 +; NO_SVE-NEXT: ldr q22, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB20_54 +; NO_SVE-NEXT: // %bb.53: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d22 +; NO_SVE-NEXT: ld1 { v21.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #27, .LBB20_55 +; NO_SVE-NEXT: b .LBB20_56 +; NO_SVE-NEXT: .LBB20_54: +; NO_SVE-NEXT: // implicit-def: $q21 +; NO_SVE-NEXT: tbz w8, #27, .LBB20_56 +; NO_SVE-NEXT: .LBB20_55: // %cond.load79 +; NO_SVE-NEXT: mov x9, v22.d[1] +; NO_SVE-NEXT: ld1 { v21.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_56: // %else80 +; NO_SVE-NEXT: ldr q23, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB20_58 +; NO_SVE-NEXT: // %bb.57: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v22.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB20_59 +; NO_SVE-NEXT: b .LBB20_60 +; NO_SVE-NEXT: .LBB20_58: +; NO_SVE-NEXT: // implicit-def: $q22 +; NO_SVE-NEXT: tbz w8, #29, .LBB20_60 +; NO_SVE-NEXT: .LBB20_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v22.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_60: // %else86 +; NO_SVE-NEXT: ldr q24, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB20_62 +; NO_SVE-NEXT: // %bb.61: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d24 +; NO_SVE-NEXT: ld1 { v23.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB20_63 +; NO_SVE-NEXT: b .LBB20_64 +; NO_SVE-NEXT: .LBB20_62: +; NO_SVE-NEXT: // implicit-def: $q23 +; NO_SVE-NEXT: tbz w8, #31, .LBB20_64 +; NO_SVE-NEXT: .LBB20_63: // %cond.load91 +; 
NO_SVE-NEXT: mov x8, v24.d[1] +; NO_SVE-NEXT: ld1 { v23.d }[1], [x8] +; NO_SVE-NEXT: .LBB20_64: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: stp q16, q17, [x0, #128] +; NO_SVE-NEXT: stp q18, q19, [x0, #160] +; NO_SVE-NEXT: stp q20, q21, [x0, #192] +; NO_SVE-NEXT: stp q22, q23, [x0, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: mov x13, #24 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z6.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z7.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -602,6 +3957,32 @@ ; define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.4h, v0.4h, #0.0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov 
w9, v0.h[0] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB21_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: .LBB21_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB21_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x8] +; NO_SVE-NEXT: .LBB21_4: // %else2 +; NO_SVE-NEXT: str s0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] @@ -638,6 +4019,54 @@ } define void @masked_gather_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.4h, v0.4h, #0.0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB22_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB22_7 +; NO_SVE-NEXT: .LBB22_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB22_8 +; NO_SVE-NEXT: .LBB22_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB22_5 +; NO_SVE-NEXT: .LBB22_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x8] +; NO_SVE-NEXT: .LBB22_5: // %else8 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB22_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB22_2 +; NO_SVE-NEXT: .LBB22_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB22_3 +; NO_SVE-NEXT: .LBB22_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB22_4 +; NO_SVE-NEXT: b .LBB22_5 +; ; CHECK-LABEL: masked_gather_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -661,6 +4090,121 @@ } define void @masked_gather_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl 
#6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB23_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB23_11 +; NO_SVE-NEXT: .LBB23_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB23_12 +; NO_SVE-NEXT: .LBB23_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB23_13 +; NO_SVE-NEXT: .LBB23_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB23_14 +; NO_SVE-NEXT: .LBB23_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB23_15 +; NO_SVE-NEXT: .LBB23_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB23_16 +; NO_SVE-NEXT: .LBB23_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB23_9 +; NO_SVE-NEXT: .LBB23_8: // %cond.load19 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB23_9: // %else20 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB23_10: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB23_2 +; NO_SVE-NEXT: .LBB23_11: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB23_3 +; NO_SVE-NEXT: .LBB23_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB23_4 +; NO_SVE-NEXT: .LBB23_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB23_5 +; NO_SVE-NEXT: .LBB23_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB23_6 +; NO_SVE-NEXT: .LBB23_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB23_7 +; NO_SVE-NEXT: .LBB23_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB23_8 +; NO_SVE-NEXT: b .LBB23_9 +; +; VBITS_EQ_256-LABEL: masked_gather_v8f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ldr q0, [x0] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: str q0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ldr q0, [x0] @@ -684,6 +4228,225 @@ } define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, 
v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB24_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB24_3 +; NO_SVE-NEXT: b .LBB24_4 +; NO_SVE-NEXT: .LBB24_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB24_4 +; NO_SVE-NEXT: .LBB24_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB24_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB24_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB24_13 +; NO_SVE-NEXT: .LBB24_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB24_14 +; NO_SVE-NEXT: .LBB24_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB24_15 +; NO_SVE-NEXT: .LBB24_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB24_16 +; NO_SVE-NEXT: .LBB24_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB24_17 +; NO_SVE-NEXT: .LBB24_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB24_18 +; NO_SVE-NEXT: .LBB24_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB24_19 +; NO_SVE-NEXT: b .LBB24_20 +; NO_SVE-NEXT: .LBB24_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB24_6 +; NO_SVE-NEXT: .LBB24_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB24_7 +; NO_SVE-NEXT: .LBB24_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB24_8 +; 
NO_SVE-NEXT: .LBB24_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB24_9 +; NO_SVE-NEXT: .LBB24_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB24_10 +; NO_SVE-NEXT: .LBB24_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB24_11 +; NO_SVE-NEXT: .LBB24_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB24_20 +; NO_SVE-NEXT: .LBB24_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB24_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB24_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB24_29 +; NO_SVE-NEXT: .LBB24_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB24_30 +; NO_SVE-NEXT: .LBB24_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB24_31 +; NO_SVE-NEXT: .LBB24_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB24_32 +; NO_SVE-NEXT: .LBB24_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB24_27 +; NO_SVE-NEXT: .LBB24_26: // %cond.load43 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x8] +; NO_SVE-NEXT: .LBB24_27: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB24_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB24_22 +; NO_SVE-NEXT: .LBB24_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB24_23 +; NO_SVE-NEXT: .LBB24_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB24_24 +; NO_SVE-NEXT: .LBB24_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB24_25 +; NO_SVE-NEXT: .LBB24_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB24_26 +; NO_SVE-NEXT: b .LBB24_27 +; +; VBITS_EQ_256-LABEL: masked_gather_v16f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z4.s, z2.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne 
p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z4.s +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -707,6 +4470,431 @@ } define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, #0.0 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v2.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[0] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: umov w9, v2.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v2.b[5] +; NO_SVE-NEXT: bfi w14, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v3.8h, #0.0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[6] +; NO_SVE-NEXT: bfi w14, w15, #2, #1 +; NO_SVE-NEXT: and w11, 
w11, #0x1 +; NO_SVE-NEXT: bfi w14, w9, #3, #1 +; NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w14, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w14, w11, #5, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[0] +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w11, w14, w11, lsl #6 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #13 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB25_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB25_3 +; NO_SVE-NEXT: b .LBB25_4 +; NO_SVE-NEXT: .LBB25_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB25_4 +; NO_SVE-NEXT: .LBB25_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB25_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB25_13 +; NO_SVE-NEXT: .LBB25_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB25_14 +; NO_SVE-NEXT: .LBB25_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB25_15 +; NO_SVE-NEXT: .LBB25_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB25_16 +; NO_SVE-NEXT: .LBB25_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB25_17 +; NO_SVE-NEXT: .LBB25_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB25_18 +; NO_SVE-NEXT: .LBB25_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB25_19 +; NO_SVE-NEXT: b .LBB25_20 +; NO_SVE-NEXT: .LBB25_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB25_6 +; NO_SVE-NEXT: .LBB25_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB25_7 +; NO_SVE-NEXT: .LBB25_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB25_8 +; NO_SVE-NEXT: .LBB25_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB25_9 +; NO_SVE-NEXT: .LBB25_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB25_10 +; NO_SVE-NEXT: 
.LBB25_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB25_11 +; NO_SVE-NEXT: .LBB25_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB25_20 +; NO_SVE-NEXT: .LBB25_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB25_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB25_29 +; NO_SVE-NEXT: .LBB25_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB25_30 +; NO_SVE-NEXT: .LBB25_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB25_31 +; NO_SVE-NEXT: .LBB25_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB25_32 +; NO_SVE-NEXT: .LBB25_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB25_33 +; NO_SVE-NEXT: .LBB25_26: // %else44 +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB25_34 +; NO_SVE-NEXT: .LBB25_27: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB25_35 +; NO_SVE-NEXT: b .LBB25_36 +; NO_SVE-NEXT: .LBB25_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB25_22 +; NO_SVE-NEXT: .LBB25_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB25_23 +; NO_SVE-NEXT: .LBB25_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB25_24 +; NO_SVE-NEXT: .LBB25_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB25_25 +; NO_SVE-NEXT: .LBB25_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB25_26 +; NO_SVE-NEXT: .LBB25_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB25_27 +; NO_SVE-NEXT: .LBB25_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB25_36 +; NO_SVE-NEXT: .LBB25_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_36: // %else50 +; NO_SVE-NEXT: ldr q3, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB25_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB25_45 +; NO_SVE-NEXT: .LBB25_38: // %else56 +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB25_46 +; NO_SVE-NEXT: .LBB25_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB25_47 +; NO_SVE-NEXT: .LBB25_40: // %else62 +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB25_48 +; NO_SVE-NEXT: .LBB25_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB25_49 +; NO_SVE-NEXT: .LBB25_42: // %else68 +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB25_50 +; NO_SVE-NEXT: .LBB25_43: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB25_51 +; NO_SVE-NEXT: b .LBB25_52 +; NO_SVE-NEXT: .LBB25_44: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB25_38 +; NO_SVE-NEXT: .LBB25_45: // %cond.load55 +; NO_SVE-NEXT: mov x9, v3.d[1] +; 
NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB25_39 +; NO_SVE-NEXT: .LBB25_46: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB25_40 +; NO_SVE-NEXT: .LBB25_47: // %cond.load61 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB25_41 +; NO_SVE-NEXT: .LBB25_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB25_42 +; NO_SVE-NEXT: .LBB25_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB25_43 +; NO_SVE-NEXT: .LBB25_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #25, .LBB25_52 +; NO_SVE-NEXT: .LBB25_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_52: // %else74 +; NO_SVE-NEXT: ldr q4, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB25_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB25_61 +; NO_SVE-NEXT: .LBB25_54: // %else80 +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB25_62 +; NO_SVE-NEXT: .LBB25_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB25_63 +; NO_SVE-NEXT: .LBB25_56: // %else86 +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB25_64 +; NO_SVE-NEXT: .LBB25_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB25_59 +; NO_SVE-NEXT: .LBB25_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x8] +; NO_SVE-NEXT: .LBB25_59: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB25_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB25_54 +; NO_SVE-NEXT: .LBB25_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB25_55 +; NO_SVE-NEXT: .LBB25_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB25_56 +; NO_SVE-NEXT: .LBB25_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB25_57 +; NO_SVE-NEXT: .LBB25_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB25_58 +; NO_SVE-NEXT: b .LBB25_59 +; +; VBITS_EQ_256-LABEL: masked_gather_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z3.h, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; 
VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z3.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z18.h +; VBITS_EQ_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z4.h, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_EQ_256-NEXT: sunpklo z6.s, z16.h +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: sunpklo z5.s, z16.h +; VBITS_EQ_256-NEXT: sunpklo z6.s, z17.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -734,6 +4922,32 @@ ; define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2f32: +; NO_SVE: // %bb.0: +; 
NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.2s, v0.2s, #0.0 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB26_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: .LBB26_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB26_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x8] +; NO_SVE-NEXT: .LBB26_4: // %else2 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -755,6 +4969,55 @@ } define void @masked_gather_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB27_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB27_7 +; NO_SVE-NEXT: .LBB27_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB27_8 +; NO_SVE-NEXT: .LBB27_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB27_5 +; NO_SVE-NEXT: .LBB27_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x8] +; NO_SVE-NEXT: .LBB27_5: // %else8 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB27_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB27_2 +; NO_SVE-NEXT: .LBB27_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB27_3 +; NO_SVE-NEXT: .LBB27_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB27_4 +; NO_SVE-NEXT: b .LBB27_5 +; ; CHECK-LABEL: masked_gather_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -776,6 +5039,121 @@ } define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: 
and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB28_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB28_3 +; NO_SVE-NEXT: b .LBB28_4 +; NO_SVE-NEXT: .LBB28_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB28_4 +; NO_SVE-NEXT: .LBB28_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB28_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB28_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB28_9 +; NO_SVE-NEXT: .LBB28_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB28_10 +; NO_SVE-NEXT: .LBB28_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB28_11 +; NO_SVE-NEXT: b .LBB28_12 +; NO_SVE-NEXT: .LBB28_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB28_6 +; NO_SVE-NEXT: .LBB28_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB28_7 +; NO_SVE-NEXT: .LBB28_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB28_12 +; NO_SVE-NEXT: .LBB28_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB28_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB28_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB28_15 +; NO_SVE-NEXT: .LBB28_14: // %cond.load19 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x8] +; NO_SVE-NEXT: .LBB28_15: // %else20 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB28_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB28_14 +; NO_SVE-NEXT: b .LBB28_15 +; +; VBITS_EQ_256-LABEL: masked_gather_v8f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z1.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 @@ -797,6 +5175,225 
@@ } define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB29_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB29_3 +; NO_SVE-NEXT: b .LBB29_4 +; NO_SVE-NEXT: .LBB29_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB29_4 +; NO_SVE-NEXT: .LBB29_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB29_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB29_9 +; NO_SVE-NEXT: .LBB29_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB29_10 +; NO_SVE-NEXT: .LBB29_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB29_11 +; NO_SVE-NEXT: b .LBB29_12 +; NO_SVE-NEXT: .LBB29_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB29_6 +; NO_SVE-NEXT: .LBB29_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB29_7 +; NO_SVE-NEXT: .LBB29_10: +; NO_SVE-NEXT: // 
implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB29_12 +; NO_SVE-NEXT: .LBB29_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB29_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB29_17 +; NO_SVE-NEXT: .LBB29_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB29_18 +; NO_SVE-NEXT: .LBB29_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB29_19 +; NO_SVE-NEXT: b .LBB29_20 +; NO_SVE-NEXT: .LBB29_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB29_14 +; NO_SVE-NEXT: .LBB29_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB29_15 +; NO_SVE-NEXT: .LBB29_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB29_20 +; NO_SVE-NEXT: .LBB29_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB29_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB29_25 +; NO_SVE-NEXT: .LBB29_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB29_26 +; NO_SVE-NEXT: .LBB29_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB29_27 +; NO_SVE-NEXT: b .LBB29_28 +; NO_SVE-NEXT: .LBB29_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB29_22 +; NO_SVE-NEXT: .LBB29_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB29_23 +; NO_SVE-NEXT: .LBB29_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB29_28 +; NO_SVE-NEXT: .LBB29_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB29_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB29_31 +; NO_SVE-NEXT: .LBB29_30: // %cond.load43 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: .LBB29_31: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB29_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB29_30 +; NO_SVE-NEXT: b .LBB29_31 +; +; VBITS_EQ_256-LABEL: masked_gather_v16f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; 
VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -818,6 +5415,432 @@ } define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v4.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v4.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v4.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v4.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v2.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; 
NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v1.4s, v3.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v4.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB30_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB30_3 +; NO_SVE-NEXT: b .LBB30_4 +; NO_SVE-NEXT: .LBB30_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB30_4 +; NO_SVE-NEXT: .LBB30_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB30_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB30_9 +; NO_SVE-NEXT: .LBB30_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB30_10 +; NO_SVE-NEXT: .LBB30_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB30_11 +; NO_SVE-NEXT: b .LBB30_12 +; NO_SVE-NEXT: .LBB30_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB30_6 +; NO_SVE-NEXT: .LBB30_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB30_7 +; NO_SVE-NEXT: .LBB30_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB30_12 +; NO_SVE-NEXT: .LBB30_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, 
v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB30_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB30_17 +; NO_SVE-NEXT: .LBB30_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB30_18 +; NO_SVE-NEXT: .LBB30_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB30_19 +; NO_SVE-NEXT: b .LBB30_20 +; NO_SVE-NEXT: .LBB30_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB30_14 +; NO_SVE-NEXT: .LBB30_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB30_15 +; NO_SVE-NEXT: .LBB30_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB30_20 +; NO_SVE-NEXT: .LBB30_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB30_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB30_25 +; NO_SVE-NEXT: .LBB30_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB30_26 +; NO_SVE-NEXT: .LBB30_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB30_27 +; NO_SVE-NEXT: b .LBB30_28 +; NO_SVE-NEXT: .LBB30_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB30_22 +; NO_SVE-NEXT: .LBB30_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB30_23 +; NO_SVE-NEXT: .LBB30_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB30_28 +; NO_SVE-NEXT: .LBB30_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB30_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB30_33 +; NO_SVE-NEXT: .LBB30_30: // %else44 +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB30_34 +; NO_SVE-NEXT: .LBB30_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB30_35 +; NO_SVE-NEXT: b .LBB30_36 +; NO_SVE-NEXT: .LBB30_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB30_30 +; NO_SVE-NEXT: .LBB30_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB30_31 +; NO_SVE-NEXT: .LBB30_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #17, .LBB30_36 +; NO_SVE-NEXT: .LBB30_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_36: // %else50 +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB30_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB30_41 +; NO_SVE-NEXT: .LBB30_38: // %else56 +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB30_42 +; NO_SVE-NEXT: .LBB30_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; 
NO_SVE-NEXT: tbnz w8, #21, .LBB30_43 +; NO_SVE-NEXT: b .LBB30_44 +; NO_SVE-NEXT: .LBB30_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB30_38 +; NO_SVE-NEXT: .LBB30_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB30_39 +; NO_SVE-NEXT: .LBB30_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #21, .LBB30_44 +; NO_SVE-NEXT: .LBB30_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_44: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB30_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB30_49 +; NO_SVE-NEXT: .LBB30_46: // %else68 +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB30_50 +; NO_SVE-NEXT: .LBB30_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB30_51 +; NO_SVE-NEXT: b .LBB30_52 +; NO_SVE-NEXT: .LBB30_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB30_46 +; NO_SVE-NEXT: .LBB30_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB30_47 +; NO_SVE-NEXT: .LBB30_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #25, .LBB30_52 +; NO_SVE-NEXT: .LBB30_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_52: // %else74 +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB30_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB30_57 +; NO_SVE-NEXT: .LBB30_54: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB30_58 +; NO_SVE-NEXT: .LBB30_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB30_59 +; NO_SVE-NEXT: b .LBB30_60 +; NO_SVE-NEXT: .LBB30_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB30_54 +; NO_SVE-NEXT: .LBB30_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB30_55 +; NO_SVE-NEXT: .LBB30_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #29, .LBB30_60 +; NO_SVE-NEXT: .LBB30_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_60: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB30_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB30_63 +; NO_SVE-NEXT: .LBB30_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB30_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB30_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB30_62 +; NO_SVE-NEXT: b .LBB30_63 +; +; VBITS_EQ_256-LABEL: masked_gather_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; 
VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] 
+; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -844,6 +5867,20 @@ ; Scalarize 1 x double gathers define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: fcmp d0, #0.0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: b.ne .LBB31_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: ldr d0, [x8] +; NO_SVE-NEXT: .LBB31_2: // %else +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -866,6 +5903,33 @@ } define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: xtn v0.2s, v0.2d +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB32_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: .LBB32_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB32_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x8] +; NO_SVE-NEXT: .LBB32_4: // %else2 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -885,6 +5949,57 @@ } define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbz w10, #0, .LBB33_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB33_3 +; NO_SVE-NEXT: b .LBB33_4 +; NO_SVE-NEXT: .LBB33_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB33_4 +; NO_SVE-NEXT: .LBB33_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB33_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB33_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB33_7 +; NO_SVE-NEXT: b .LBB33_8 +; NO_SVE-NEXT: .LBB33_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB33_8 +; NO_SVE-NEXT: .LBB33_7: // %cond.load7 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x8] +; NO_SVE-NEXT: .LBB33_8: // %else8 +; NO_SVE-NEXT: stp q0, q1, [x0] 
+; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -903,6 +6018,120 @@ } define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB34_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB34_3 +; NO_SVE-NEXT: b .LBB34_4 +; NO_SVE-NEXT: .LBB34_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB34_4 +; NO_SVE-NEXT: .LBB34_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB34_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB34_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB34_7 +; NO_SVE-NEXT: b .LBB34_8 +; NO_SVE-NEXT: .LBB34_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB34_8 +; NO_SVE-NEXT: .LBB34_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB34_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB34_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB34_11 +; NO_SVE-NEXT: b .LBB34_12 +; NO_SVE-NEXT: .LBB34_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB34_12 +; NO_SVE-NEXT: .LBB34_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB34_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB34_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB34_15 +; NO_SVE-NEXT: b .LBB34_16 +; NO_SVE-NEXT: .LBB34_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB34_16 +; NO_SVE-NEXT: .LBB34_15: // %cond.load19 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: .LBB34_16: // %else20 +; NO_SVE-NEXT: stp q0, q1, 
[x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v8f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -921,6 +6150,224 @@ } define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: uzp1 v3.4s, v4.4s, v5.4s +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: fcmeq v5.2d, v6.2d, #0.0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: fcmeq v4.2d, v7.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v2.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; 
NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB35_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB35_3 +; NO_SVE-NEXT: b .LBB35_4 +; NO_SVE-NEXT: .LBB35_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB35_4 +; NO_SVE-NEXT: .LBB35_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB35_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB35_7 +; NO_SVE-NEXT: b .LBB35_8 +; NO_SVE-NEXT: .LBB35_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB35_8 +; NO_SVE-NEXT: .LBB35_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB35_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB35_11 +; NO_SVE-NEXT: b .LBB35_12 +; NO_SVE-NEXT: .LBB35_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB35_12 +; NO_SVE-NEXT: .LBB35_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB35_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB35_15 +; NO_SVE-NEXT: b .LBB35_16 +; NO_SVE-NEXT: .LBB35_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB35_16 +; NO_SVE-NEXT: .LBB35_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB35_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB35_19 +; NO_SVE-NEXT: b .LBB35_20 +; NO_SVE-NEXT: .LBB35_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB35_20 +; NO_SVE-NEXT: .LBB35_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB35_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB35_23 +; NO_SVE-NEXT: b .LBB35_24 +; NO_SVE-NEXT: .LBB35_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB35_24 +; NO_SVE-NEXT: .LBB35_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB35_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB35_27 +; NO_SVE-NEXT: b .LBB35_28 +; NO_SVE-NEXT: .LBB35_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB35_28 +; NO_SVE-NEXT: .LBB35_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, 
#112] +; NO_SVE-NEXT: tbz w8, #14, .LBB35_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB35_31 +; NO_SVE-NEXT: b .LBB35_32 +; NO_SVE-NEXT: .LBB35_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB35_32 +; NO_SVE-NEXT: .LBB35_31: // %cond.load43 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x8] +; NO_SVE-NEXT: .LBB35_32: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p4.d, p0/z, z3.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -939,6 +6386,430 @@ } define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #160] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: ldp q4, q5, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: ldp q6, q7, [x0, #192] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: uzp1 v2.8h, v4.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.2d, v6.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: ldp q3, q5, [x0, #32] +; NO_SVE-NEXT: fcmeq v7.2d, v7.2d, #0.0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: ldp q16, q17, [x0] +; NO_SVE-NEXT: fcmeq v1.2d, v5.2d, #0.0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq 
v6.2d, v16.2d, #0.0 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: fcmeq v4.2d, v17.2d, #0.0 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v1.4s, v3.4s, v1.4s +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: uzp1 v3.4s, v6.4s, v4.4s +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w8, v0.b[0] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: and w9, w12, #0x1 +; NO_SVE-NEXT: umov w10, v0.b[5] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w13, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #12 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: umov w13, v1.b[3] +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v4.8h +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[0] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, 
lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB36_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB36_3 +; NO_SVE-NEXT: b .LBB36_4 +; NO_SVE-NEXT: .LBB36_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB36_4 +; NO_SVE-NEXT: .LBB36_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB36_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB36_7 +; NO_SVE-NEXT: b .LBB36_8 +; NO_SVE-NEXT: .LBB36_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB36_8 +; NO_SVE-NEXT: .LBB36_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB36_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB36_11 +; NO_SVE-NEXT: b .LBB36_12 +; NO_SVE-NEXT: .LBB36_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB36_12 +; NO_SVE-NEXT: .LBB36_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB36_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB36_15 +; NO_SVE-NEXT: b .LBB36_16 +; NO_SVE-NEXT: .LBB36_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB36_16 +; NO_SVE-NEXT: .LBB36_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB36_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB36_19 +; NO_SVE-NEXT: b .LBB36_20 +; NO_SVE-NEXT: .LBB36_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB36_20 +; NO_SVE-NEXT: .LBB36_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB36_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB36_23 +; NO_SVE-NEXT: b .LBB36_24 +; NO_SVE-NEXT: .LBB36_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB36_24 +; NO_SVE-NEXT: .LBB36_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB36_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB36_27 +; NO_SVE-NEXT: b .LBB36_28 +; NO_SVE-NEXT: .LBB36_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB36_28 +; NO_SVE-NEXT: .LBB36_27: // %cond.load37 +; NO_SVE-NEXT: 
mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB36_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB36_31 +; NO_SVE-NEXT: b .LBB36_32 +; NO_SVE-NEXT: .LBB36_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB36_32 +; NO_SVE-NEXT: .LBB36_31: // %cond.load43 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_32: // %else44 +; NO_SVE-NEXT: ldr q17, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB36_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB36_35 +; NO_SVE-NEXT: b .LBB36_36 +; NO_SVE-NEXT: .LBB36_34: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: tbz w8, #17, .LBB36_36 +; NO_SVE-NEXT: .LBB36_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_36: // %else50 +; NO_SVE-NEXT: ldr q18, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB36_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v17.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #19, .LBB36_39 +; NO_SVE-NEXT: b .LBB36_40 +; NO_SVE-NEXT: .LBB36_38: +; NO_SVE-NEXT: // implicit-def: $q17 +; NO_SVE-NEXT: tbz w8, #19, .LBB36_40 +; NO_SVE-NEXT: .LBB36_39: // %cond.load55 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v17.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_40: // %else56 +; NO_SVE-NEXT: ldr q19, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB36_42 +; NO_SVE-NEXT: // %bb.41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v18.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB36_43 +; NO_SVE-NEXT: b .LBB36_44 +; NO_SVE-NEXT: .LBB36_42: +; NO_SVE-NEXT: // implicit-def: $q18 +; NO_SVE-NEXT: tbz w8, #21, .LBB36_44 +; NO_SVE-NEXT: .LBB36_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v18.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_44: // %else62 +; NO_SVE-NEXT: ldr q20, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB36_46 +; NO_SVE-NEXT: // %bb.45: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d20 +; NO_SVE-NEXT: ld1 { v19.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #23, .LBB36_47 +; NO_SVE-NEXT: b .LBB36_48 +; NO_SVE-NEXT: .LBB36_46: +; NO_SVE-NEXT: // implicit-def: $q19 +; NO_SVE-NEXT: tbz w8, #23, .LBB36_48 +; NO_SVE-NEXT: .LBB36_47: // %cond.load67 +; NO_SVE-NEXT: mov x9, v20.d[1] +; NO_SVE-NEXT: ld1 { v19.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_48: // %else68 +; NO_SVE-NEXT: ldr q21, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB36_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v20.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB36_51 +; NO_SVE-NEXT: b .LBB36_52 +; NO_SVE-NEXT: .LBB36_50: +; NO_SVE-NEXT: // implicit-def: $q20 +; NO_SVE-NEXT: tbz w8, #25, .LBB36_52 +; NO_SVE-NEXT: .LBB36_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v20.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_52: // %else74 +; NO_SVE-NEXT: ldr q22, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB36_54 +; NO_SVE-NEXT: // %bb.53: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d22 +; NO_SVE-NEXT: ld1 { v21.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #27, .LBB36_55 +; NO_SVE-NEXT: b .LBB36_56 +; NO_SVE-NEXT: .LBB36_54: +; NO_SVE-NEXT: // implicit-def: $q21 +; NO_SVE-NEXT: tbz w8, #27, .LBB36_56 +; NO_SVE-NEXT: .LBB36_55: // 
%cond.load79 +; NO_SVE-NEXT: mov x9, v22.d[1] +; NO_SVE-NEXT: ld1 { v21.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_56: // %else80 +; NO_SVE-NEXT: ldr q23, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB36_58 +; NO_SVE-NEXT: // %bb.57: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v22.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB36_59 +; NO_SVE-NEXT: b .LBB36_60 +; NO_SVE-NEXT: .LBB36_58: +; NO_SVE-NEXT: // implicit-def: $q22 +; NO_SVE-NEXT: tbz w8, #29, .LBB36_60 +; NO_SVE-NEXT: .LBB36_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v22.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_60: // %else86 +; NO_SVE-NEXT: ldr q24, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB36_62 +; NO_SVE-NEXT: // %bb.61: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d24 +; NO_SVE-NEXT: ld1 { v23.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB36_63 +; NO_SVE-NEXT: b .LBB36_64 +; NO_SVE-NEXT: .LBB36_62: +; NO_SVE-NEXT: // implicit-def: $q23 +; NO_SVE-NEXT: tbz w8, #31, .LBB36_64 +; NO_SVE-NEXT: .LBB36_63: // %cond.load91 +; NO_SVE-NEXT: mov x8, v24.d[1] +; NO_SVE-NEXT: ld1 { v23.d }[1], [x8] +; NO_SVE-NEXT: .LBB36_64: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: stp q16, q17, [x0, #128] +; NO_SVE-NEXT: stp q18, q19, [x0, #160] +; NO_SVE-NEXT: stp q20, q21, [x0, #192] +; NO_SVE-NEXT: stp q22, q23, [x0, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: mov x13, #24 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z5.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_EQ_256-NEXT: 
fcmeq p1.d, p0/z, z7.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -960,18 +6831,436 @@ ; modes still function define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f16: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_scaled_sext_f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, 
#1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: sshll v0.2d, v2.2s, #1 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v4.2d, v1.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB37_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: b .LBB37_3 +; NO_SVE-NEXT: .LBB37_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB37_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: sshll2 v2.2d, v2.4s, #1 +; NO_SVE-NEXT: tbnz w8, #1, .LBB37_13 +; NO_SVE-NEXT: // %bb.4: // %else2 +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB37_14 +; NO_SVE-NEXT: .LBB37_5: // %else5 +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #3, .LBB37_15 +; NO_SVE-NEXT: .LBB37_6: // %else8 +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB37_8 +; NO_SVE-NEXT: .LBB37_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: .LBB37_8: // %else11 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #5, .LBB37_16 +; NO_SVE-NEXT: // %bb.9: // %else14 +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB37_17 +; NO_SVE-NEXT: .LBB37_10: // %else17 +; NO_SVE-NEXT: sshll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #7, .LBB37_18 +; NO_SVE-NEXT: .LBB37_11: // %else20 +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB37_19 +; NO_SVE-NEXT: .LBB37_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: b .LBB37_20 +; NO_SVE-NEXT: .LBB37_13: // %cond.load1 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB37_5 +; NO_SVE-NEXT: .LBB37_14: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; 
NO_SVE-NEXT: sshll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #3, .LBB37_6 +; NO_SVE-NEXT: .LBB37_15: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB37_7 +; NO_SVE-NEXT: b .LBB37_8 +; NO_SVE-NEXT: .LBB37_16: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB37_10 +; NO_SVE-NEXT: .LBB37_17: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: sshll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #7, .LBB37_11 +; NO_SVE-NEXT: .LBB37_18: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB37_12 +; NO_SVE-NEXT: .LBB37_19: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: .LBB37_20: // %else23 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: sshll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #9, .LBB37_30 +; NO_SVE-NEXT: // %bb.21: // %else26 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB37_31 +; NO_SVE-NEXT: .LBB37_22: // %else29 +; NO_SVE-NEXT: sshll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #11, .LBB37_32 +; NO_SVE-NEXT: .LBB37_23: // %else32 +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB37_25 +; NO_SVE-NEXT: .LBB37_24: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: .LBB37_25: // %else35 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #13, .LBB37_33 +; NO_SVE-NEXT: // %bb.26: // %else38 +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB37_34 +; NO_SVE-NEXT: .LBB37_27: // %else41 +; NO_SVE-NEXT: sshll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #15, .LBB37_35 +; NO_SVE-NEXT: .LBB37_28: // %else44 +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB37_36 +; NO_SVE-NEXT: .LBB37_29: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: b .LBB37_37 +; NO_SVE-NEXT: .LBB37_30: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB37_22 +; NO_SVE-NEXT: .LBB37_31: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: sshll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #11, .LBB37_23 +; NO_SVE-NEXT: .LBB37_32: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB37_24 +; NO_SVE-NEXT: b .LBB37_25 +; NO_SVE-NEXT: .LBB37_33: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB37_27 +; NO_SVE-NEXT: .LBB37_34: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: sshll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #15, .LBB37_28 +; NO_SVE-NEXT: .LBB37_35: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB37_29 +; NO_SVE-NEXT: .LBB37_36: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB37_37: // %else47 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #17, 
.LBB37_47 +; NO_SVE-NEXT: // %bb.38: // %else50 +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB37_48 +; NO_SVE-NEXT: .LBB37_39: // %else53 +; NO_SVE-NEXT: sshll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #19, .LBB37_49 +; NO_SVE-NEXT: .LBB37_40: // %else56 +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB37_42 +; NO_SVE-NEXT: .LBB37_41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: .LBB37_42: // %else59 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: sshll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #21, .LBB37_50 +; NO_SVE-NEXT: // %bb.43: // %else62 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB37_51 +; NO_SVE-NEXT: .LBB37_44: // %else65 +; NO_SVE-NEXT: sshll v5.2d, v6.2s, #1 +; NO_SVE-NEXT: tbnz w8, #23, .LBB37_52 +; NO_SVE-NEXT: .LBB37_45: // %else68 +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB37_53 +; NO_SVE-NEXT: .LBB37_46: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: b .LBB37_54 +; NO_SVE-NEXT: .LBB37_47: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB37_39 +; NO_SVE-NEXT: .LBB37_48: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: sshll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #19, .LBB37_40 +; NO_SVE-NEXT: .LBB37_49: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB37_41 +; NO_SVE-NEXT: b .LBB37_42 +; NO_SVE-NEXT: .LBB37_50: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB37_44 +; NO_SVE-NEXT: .LBB37_51: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: sshll v5.2d, v6.2s, #1 +; NO_SVE-NEXT: tbz w8, #23, .LBB37_45 +; NO_SVE-NEXT: .LBB37_52: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB37_46 +; NO_SVE-NEXT: .LBB37_53: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB37_54: // %else71 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: sshll2 v6.2d, v6.4s, #1 +; NO_SVE-NEXT: tbnz w8, #25, .LBB37_63 +; NO_SVE-NEXT: // %bb.55: // %else74 +; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB37_64 +; NO_SVE-NEXT: .LBB37_56: // %else77 +; NO_SVE-NEXT: sshll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #27, .LBB37_65 +; NO_SVE-NEXT: .LBB37_57: // %else80 +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB37_66 +; NO_SVE-NEXT: .LBB37_58: // %else83 +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #29, .LBB37_67 +; NO_SVE-NEXT: .LBB37_59: // %else86 +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB37_68 +; NO_SVE-NEXT: .LBB37_60: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB37_62 +; NO_SVE-NEXT: .LBB37_61: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB37_62: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB37_63: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] 
+; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB37_56 +; NO_SVE-NEXT: .LBB37_64: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: sshll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #27, .LBB37_57 +; NO_SVE-NEXT: .LBB37_65: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB37_58 +; NO_SVE-NEXT: .LBB37_66: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbz w8, #29, .LBB37_59 +; NO_SVE-NEXT: .LBB37_67: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #30, .LBB37_60 +; NO_SVE-NEXT: .LBB37_68: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB37_61 +; NO_SVE-NEXT: b .LBB37_62 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw #1] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw #1] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw #1] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw #1] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals 
= load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -983,15 +7272,438 @@ } define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_scaled_sext_f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: 
umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #2 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v4.2d, v2.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB38_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: b .LBB38_3 +; NO_SVE-NEXT: .LBB38_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB38_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: sshll2 v1.2d, v1.4s, #2 +; NO_SVE-NEXT: tbnz w8, #1, .LBB38_8 +; NO_SVE-NEXT: // %bb.4: // %else2 +; NO_SVE-NEXT: add v1.2d, v2.2d, v1.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB38_9 +; NO_SVE-NEXT: .LBB38_5: // %else5 +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #2 +; NO_SVE-NEXT: tbnz w8, #3, .LBB38_10 +; NO_SVE-NEXT: .LBB38_6: // %else8 +; NO_SVE-NEXT: add v5.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB38_11 +; NO_SVE-NEXT: .LBB38_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_12 +; NO_SVE-NEXT: .LBB38_8: // %cond.load1 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: add v1.2d, v2.2d, v1.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB38_5 +; NO_SVE-NEXT: .LBB38_9: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #2 +; NO_SVE-NEXT: tbz w8, #3, .LBB38_6 +; NO_SVE-NEXT: .LBB38_10: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB38_7 +; NO_SVE-NEXT: .LBB38_11: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: .LBB38_12: // %else11 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #2 +; NO_SVE-NEXT: tbnz w8, #5, .LBB38_17 +; NO_SVE-NEXT: // %bb.13: // %else14 +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB38_18 +; NO_SVE-NEXT: .LBB38_14: // %else17 +; NO_SVE-NEXT: sshll v5.2d, v4.2s, #2 +; NO_SVE-NEXT: tbnz w8, #7, .LBB38_19 +; NO_SVE-NEXT: .LBB38_15: // %else20 +; NO_SVE-NEXT: add v6.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB38_20 +; NO_SVE-NEXT: .LBB38_16: // %cond.load22 +; 
NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_21 +; NO_SVE-NEXT: .LBB38_17: // %cond.load13 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB38_14 +; NO_SVE-NEXT: .LBB38_18: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: sshll v5.2d, v4.2s, #2 +; NO_SVE-NEXT: tbz w8, #7, .LBB38_15 +; NO_SVE-NEXT: .LBB38_19: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB38_16 +; NO_SVE-NEXT: .LBB38_20: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB38_21: // %else23 +; NO_SVE-NEXT: ldr q5, [x1, #48] +; NO_SVE-NEXT: sshll2 v4.2d, v4.4s, #2 +; NO_SVE-NEXT: tbnz w8, #9, .LBB38_26 +; NO_SVE-NEXT: // %bb.22: // %else26 +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB38_27 +; NO_SVE-NEXT: .LBB38_23: // %else29 +; NO_SVE-NEXT: sshll v6.2d, v5.2s, #2 +; NO_SVE-NEXT: tbnz w8, #11, .LBB38_28 +; NO_SVE-NEXT: .LBB38_24: // %else32 +; NO_SVE-NEXT: add v7.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB38_29 +; NO_SVE-NEXT: .LBB38_25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_30 +; NO_SVE-NEXT: .LBB38_26: // %cond.load25 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB38_23 +; NO_SVE-NEXT: .LBB38_27: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: sshll v6.2d, v5.2s, #2 +; NO_SVE-NEXT: tbz w8, #11, .LBB38_24 +; NO_SVE-NEXT: .LBB38_28: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB38_25 +; NO_SVE-NEXT: .LBB38_29: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB38_30: // %else35 +; NO_SVE-NEXT: ldr q6, [x1, #64] +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #2 +; NO_SVE-NEXT: tbnz w8, #13, .LBB38_35 +; NO_SVE-NEXT: // %bb.31: // %else38 +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB38_36 +; NO_SVE-NEXT: .LBB38_32: // %else41 +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #2 +; NO_SVE-NEXT: tbnz w8, #15, .LBB38_37 +; NO_SVE-NEXT: .LBB38_33: // %else44 +; NO_SVE-NEXT: add v16.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB38_38 +; NO_SVE-NEXT: .LBB38_34: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_39 +; NO_SVE-NEXT: .LBB38_35: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB38_32 +; NO_SVE-NEXT: .LBB38_36: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #2 +; NO_SVE-NEXT: tbz w8, #15, .LBB38_33 +; NO_SVE-NEXT: .LBB38_37: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB38_34 +; NO_SVE-NEXT: .LBB38_38: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: .LBB38_39: // %else47 +; NO_SVE-NEXT: ldr q7, [x1, #80] +; NO_SVE-NEXT: sshll2 v6.2d, v6.4s, #2 +; NO_SVE-NEXT: tbnz w8, #17, .LBB38_44 +; NO_SVE-NEXT: // %bb.40: // %else50 +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB38_45 +; 
NO_SVE-NEXT: .LBB38_41: // %else53 +; NO_SVE-NEXT: sshll v16.2d, v7.2s, #2 +; NO_SVE-NEXT: tbnz w8, #19, .LBB38_46 +; NO_SVE-NEXT: .LBB38_42: // %else56 +; NO_SVE-NEXT: add v17.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB38_47 +; NO_SVE-NEXT: .LBB38_43: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_48 +; NO_SVE-NEXT: .LBB38_44: // %cond.load49 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB38_41 +; NO_SVE-NEXT: .LBB38_45: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: sshll v16.2d, v7.2s, #2 +; NO_SVE-NEXT: tbz w8, #19, .LBB38_42 +; NO_SVE-NEXT: .LBB38_46: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB38_43 +; NO_SVE-NEXT: .LBB38_47: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: .LBB38_48: // %else59 +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: sshll2 v7.2d, v7.4s, #2 +; NO_SVE-NEXT: tbnz w8, #21, .LBB38_53 +; NO_SVE-NEXT: // %bb.49: // %else62 +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB38_54 +; NO_SVE-NEXT: .LBB38_50: // %else65 +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #2 +; NO_SVE-NEXT: tbnz w8, #23, .LBB38_55 +; NO_SVE-NEXT: .LBB38_51: // %else68 +; NO_SVE-NEXT: add v18.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB38_56 +; NO_SVE-NEXT: .LBB38_52: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_57 +; NO_SVE-NEXT: .LBB38_53: // %cond.load61 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB38_50 +; NO_SVE-NEXT: .LBB38_54: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #2 +; NO_SVE-NEXT: tbz w8, #23, .LBB38_51 +; NO_SVE-NEXT: .LBB38_55: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v18.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB38_52 +; NO_SVE-NEXT: .LBB38_56: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: .LBB38_57: // %else71 +; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: sshll2 v16.2d, v16.4s, #2 +; NO_SVE-NEXT: tbnz w8, #25, .LBB38_62 +; NO_SVE-NEXT: // %bb.58: // %else74 +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB38_63 +; NO_SVE-NEXT: .LBB38_59: // %else77 +; NO_SVE-NEXT: sshll v18.2d, v17.2s, #2 +; NO_SVE-NEXT: tbnz w8, #27, .LBB38_64 +; NO_SVE-NEXT: .LBB38_60: // %else80 +; NO_SVE-NEXT: add v18.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB38_65 +; NO_SVE-NEXT: .LBB38_61: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: sshll2 v17.2d, v17.4s, #2 +; NO_SVE-NEXT: tbnz w8, #29, .LBB38_66 +; NO_SVE-NEXT: b .LBB38_67 +; NO_SVE-NEXT: .LBB38_62: // %cond.load73 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB38_59 +; NO_SVE-NEXT: .LBB38_63: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: sshll v18.2d, v17.2s, #2 +; NO_SVE-NEXT: tbz w8, #27, .LBB38_60 +; NO_SVE-NEXT: .LBB38_64: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add 
v18.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB38_61 +; NO_SVE-NEXT: .LBB38_65: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: sshll2 v17.2d, v17.4s, #2 +; NO_SVE-NEXT: tbz w8, #29, .LBB38_67 +; NO_SVE-NEXT: .LBB38_66: // %cond.load85 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB38_67: // %else86 +; NO_SVE-NEXT: add v2.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB38_71 +; NO_SVE-NEXT: // %bb.68: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB38_70 +; NO_SVE-NEXT: .LBB38_69: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB38_70: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB38_71: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB38_69 +; NO_SVE-NEXT: b .LBB38_70 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p3/z, [x2, z4.s, sxtw #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p2/z, [x2, z6.s, sxtw #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x2, z5.s, sxtw #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p4/z, [x2, z7.s, sxtw #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1003,6 +7715,472 @@ } define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 { +; NO_SVE-LABEL: masked_gather_32b_scaled_sext_f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #160] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: ldp q5, q6, [x0, #128] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; NO_SVE-NEXT: ldp q0, q2, [x0, #224] +; NO_SVE-NEXT: 
fcmeq v6.2d, v6.2d, #0.0 +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: uzp1 v5.4s, v5.4s, v6.4s +; NO_SVE-NEXT: ldp q7, q16, [x0, #192] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v5.8h, v3.8h +; NO_SVE-NEXT: fcmeq v7.2d, v7.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v2.4s +; NO_SVE-NEXT: fcmeq v16.2d, v16.2d, #0.0 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: ldp q17, q18, [x0] +; NO_SVE-NEXT: uzp1 v2.4s, v7.4s, v16.4s +; NO_SVE-NEXT: fcmeq v7.2d, v17.2d, #0.0 +; NO_SVE-NEXT: ldp q19, q4, [x0, #32] +; NO_SVE-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: fcmeq v6.2d, v18.2d, #0.0 +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: fcmeq v5.2d, v19.2d, #0.0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NO_SVE-NEXT: umov w8, v0.b[0] +; NO_SVE-NEXT: uzp1 v4.4s, v7.4s, v6.4s +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #7 +; NO_SVE-NEXT: uzp1 v2.8h, v4.8h, v3.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #10 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w9, v0.b[5] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #11 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: fcmeq v6.2d, v6.2d, #0.0 +; NO_SVE-NEXT: umov w12, v2.b[2] +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: umov w10, v2.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w11, v2.b[0] +; NO_SVE-NEXT: umov w13, v2.b[3] +; NO_SVE-NEXT: uzp1 v5.4s, v5.4s, v6.4s +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v5.8h +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: dup v5.2d, x2 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; 
NO_SVE-NEXT: umov w14, v2.b[0] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #3 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v2.2d, v5.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB39_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: b .LBB39_3 +; NO_SVE-NEXT: .LBB39_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB39_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: sshll2 v1.2d, v1.4s, #3 +; NO_SVE-NEXT: tbz w8, #1, .LBB39_5 +; NO_SVE-NEXT: // %bb.4: // %cond.load1 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_5: // %else2 +; NO_SVE-NEXT: add v2.2d, v5.2d, v1.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB39_7 +; NO_SVE-NEXT: // %bb.6: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #3 +; NO_SVE-NEXT: tbnz w8, #3, .LBB39_8 +; NO_SVE-NEXT: b .LBB39_9 +; NO_SVE-NEXT: .LBB39_7: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #3 +; NO_SVE-NEXT: tbz w8, #3, .LBB39_9 +; NO_SVE-NEXT: .LBB39_8: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_9: // %else8 +; NO_SVE-NEXT: add v4.2d, v5.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB39_11 +; NO_SVE-NEXT: // %bb.10: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_12 +; NO_SVE-NEXT: .LBB39_11: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: .LBB39_12: // %else11 +; NO_SVE-NEXT: ldr q6, [x1, #32] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #3 +; NO_SVE-NEXT: tbz w8, #5, .LBB39_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_14: // %else14 +; NO_SVE-NEXT: add v4.2d, v5.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB39_16 +; NO_SVE-NEXT: // %bb.15: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #3 +; NO_SVE-NEXT: tbnz w8, #7, .LBB39_17 +; NO_SVE-NEXT: b .LBB39_18 +; NO_SVE-NEXT: .LBB39_16: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #3 +; NO_SVE-NEXT: tbz w8, #7, .LBB39_18 +; NO_SVE-NEXT: .LBB39_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_18: // %else20 +; NO_SVE-NEXT: add v7.2d, v5.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #8, 
.LBB39_20 +; NO_SVE-NEXT: // %bb.19: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_21 +; NO_SVE-NEXT: .LBB39_20: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB39_21: // %else23 +; NO_SVE-NEXT: ldr q16, [x1, #48] +; NO_SVE-NEXT: sshll2 v6.2d, v6.4s, #3 +; NO_SVE-NEXT: tbz w8, #9, .LBB39_23 +; NO_SVE-NEXT: // %bb.22: // %cond.load25 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_23: // %else26 +; NO_SVE-NEXT: add v7.2d, v5.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB39_25 +; NO_SVE-NEXT: // %bb.24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #3 +; NO_SVE-NEXT: tbnz w8, #11, .LBB39_26 +; NO_SVE-NEXT: b .LBB39_27 +; NO_SVE-NEXT: .LBB39_25: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #3 +; NO_SVE-NEXT: tbz w8, #11, .LBB39_27 +; NO_SVE-NEXT: .LBB39_26: // %cond.load31 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_27: // %else32 +; NO_SVE-NEXT: add v17.2d, v5.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB39_29 +; NO_SVE-NEXT: // %bb.28: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_30 +; NO_SVE-NEXT: .LBB39_29: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: .LBB39_30: // %else35 +; NO_SVE-NEXT: ldr q18, [x1, #64] +; NO_SVE-NEXT: sshll2 v16.2d, v16.4s, #3 +; NO_SVE-NEXT: tbz w8, #13, .LBB39_32 +; NO_SVE-NEXT: // %bb.31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_32: // %else38 +; NO_SVE-NEXT: add v17.2d, v5.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB39_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.d }[0], [x9] +; NO_SVE-NEXT: sshll v19.2d, v18.2s, #3 +; NO_SVE-NEXT: tbnz w8, #15, .LBB39_35 +; NO_SVE-NEXT: b .LBB39_36 +; NO_SVE-NEXT: .LBB39_34: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: sshll v19.2d, v18.2s, #3 +; NO_SVE-NEXT: tbz w8, #15, .LBB39_36 +; NO_SVE-NEXT: .LBB39_35: // %cond.load43 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_36: // %else44 +; NO_SVE-NEXT: add v19.2d, v5.2d, v19.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB39_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v17.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_39 +; NO_SVE-NEXT: .LBB39_38: +; NO_SVE-NEXT: // implicit-def: $q17 +; NO_SVE-NEXT: .LBB39_39: // %else47 +; NO_SVE-NEXT: ldr q20, [x1, #80] +; NO_SVE-NEXT: sshll2 v18.2d, v18.4s, #3 +; NO_SVE-NEXT: tbz w8, #17, .LBB39_41 +; NO_SVE-NEXT: // %bb.40: // %cond.load49 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v17.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_41: // %else50 +; NO_SVE-NEXT: add v19.2d, v5.2d, v18.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB39_43 +; NO_SVE-NEXT: // %bb.42: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v18.d }[0], [x9] +; NO_SVE-NEXT: sshll v21.2d, v20.2s, #3 +; NO_SVE-NEXT: tbnz w8, #19, .LBB39_44 +; NO_SVE-NEXT: b .LBB39_45 +; NO_SVE-NEXT: .LBB39_43: +; NO_SVE-NEXT: // implicit-def: $q18 +; NO_SVE-NEXT: sshll v21.2d, v20.2s, #3 +; NO_SVE-NEXT: tbz w8, #19, .LBB39_45 +; NO_SVE-NEXT: .LBB39_44: // %cond.load55 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v18.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_45: // %else56 +; NO_SVE-NEXT: add v21.2d, v5.2d, v21.2d +; NO_SVE-NEXT: tbz w8, #20, 
.LBB39_47 +; NO_SVE-NEXT: // %bb.46: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v19.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_48 +; NO_SVE-NEXT: .LBB39_47: +; NO_SVE-NEXT: // implicit-def: $q19 +; NO_SVE-NEXT: .LBB39_48: // %else59 +; NO_SVE-NEXT: ldr q22, [x1, #96] +; NO_SVE-NEXT: sshll2 v20.2d, v20.4s, #3 +; NO_SVE-NEXT: tbz w8, #21, .LBB39_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load61 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v19.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_50: // %else62 +; NO_SVE-NEXT: add v21.2d, v5.2d, v20.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB39_52 +; NO_SVE-NEXT: // %bb.51: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v20.d }[0], [x9] +; NO_SVE-NEXT: sshll v23.2d, v22.2s, #3 +; NO_SVE-NEXT: tbnz w8, #23, .LBB39_53 +; NO_SVE-NEXT: b .LBB39_54 +; NO_SVE-NEXT: .LBB39_52: +; NO_SVE-NEXT: // implicit-def: $q20 +; NO_SVE-NEXT: sshll v23.2d, v22.2s, #3 +; NO_SVE-NEXT: tbz w8, #23, .LBB39_54 +; NO_SVE-NEXT: .LBB39_53: // %cond.load67 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v20.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_54: // %else68 +; NO_SVE-NEXT: add v23.2d, v5.2d, v23.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB39_56 +; NO_SVE-NEXT: // %bb.55: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v21.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_57 +; NO_SVE-NEXT: .LBB39_56: +; NO_SVE-NEXT: // implicit-def: $q21 +; NO_SVE-NEXT: .LBB39_57: // %else71 +; NO_SVE-NEXT: ldr q24, [x1, #112] +; NO_SVE-NEXT: sshll2 v22.2d, v22.4s, #3 +; NO_SVE-NEXT: tbz w8, #25, .LBB39_59 +; NO_SVE-NEXT: // %bb.58: // %cond.load73 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v21.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_59: // %else74 +; NO_SVE-NEXT: add v23.2d, v5.2d, v22.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB39_61 +; NO_SVE-NEXT: // %bb.60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v22.d }[0], [x9] +; NO_SVE-NEXT: sshll v25.2d, v24.2s, #3 +; NO_SVE-NEXT: tbnz w8, #27, .LBB39_62 +; NO_SVE-NEXT: b .LBB39_63 +; NO_SVE-NEXT: .LBB39_61: +; NO_SVE-NEXT: // implicit-def: $q22 +; NO_SVE-NEXT: sshll v25.2d, v24.2s, #3 +; NO_SVE-NEXT: tbz w8, #27, .LBB39_63 +; NO_SVE-NEXT: .LBB39_62: // %cond.load79 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v22.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_63: // %else80 +; NO_SVE-NEXT: add v25.2d, v5.2d, v25.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB39_65 +; NO_SVE-NEXT: // %bb.64: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d25 +; NO_SVE-NEXT: ld1 { v23.d }[0], [x9] +; NO_SVE-NEXT: sshll2 v24.2d, v24.4s, #3 +; NO_SVE-NEXT: tbnz w8, #29, .LBB39_66 +; NO_SVE-NEXT: b .LBB39_67 +; NO_SVE-NEXT: .LBB39_65: +; NO_SVE-NEXT: // implicit-def: $q23 +; NO_SVE-NEXT: sshll2 v24.2d, v24.4s, #3 +; NO_SVE-NEXT: tbz w8, #29, .LBB39_67 +; NO_SVE-NEXT: .LBB39_66: // %cond.load85 +; NO_SVE-NEXT: mov x9, v25.d[1] +; NO_SVE-NEXT: ld1 { v23.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_67: // %else86 +; NO_SVE-NEXT: add v24.2d, v5.2d, v24.2d +; NO_SVE-NEXT: tbz w8, #30, .LBB39_69 +; NO_SVE-NEXT: // %bb.68: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d24 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB39_70 +; NO_SVE-NEXT: b .LBB39_71 +; NO_SVE-NEXT: .LBB39_69: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #31, .LBB39_71 +; NO_SVE-NEXT: .LBB39_70: // %cond.load91 +; NO_SVE-NEXT: mov x8, v24.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x8] +; NO_SVE-NEXT: .LBB39_71: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q6, [x0, #64] +; NO_SVE-NEXT: 
stp q7, q16, [x0, #96] +; NO_SVE-NEXT: stp q17, q18, [x0, #128] +; NO_SVE-NEXT: stp q19, q20, [x0, #160] +; NO_SVE-NEXT: stp q21, q22, [x0, #192] +; NO_SVE-NEXT: stp q23, q5, [x0, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_sext_f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: mov x13, #28 +; VBITS_EQ_256-NEXT: mov x14, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z18.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z16.s }, p1/z, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z19.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z5.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z22.d, z18.s +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: sunpklo z21.d, z17.s +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p2/z, [x2, z18.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [x2, z22.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z20.d, z16.s +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z17.d, z17.s +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p2/z, [x2, z21.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z16.d, z16.s +; VBITS_EQ_256-NEXT: sunpklo z23.d, z19.s +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p2/z, [x2, z17.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z19.d, z19.s +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p2/z, [x2, z20.d, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x2, z16.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z7.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p2/z, [x2, z23.d, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x2, z19.d, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1023,18 +8201,436 @@ } define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; 
VBITS_GE_2048-LABEL: masked_gather_32b_scaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_scaled_zext: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; 
NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: ushll v0.2d, v2.2s, #1 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v4.2d, v1.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB40_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: b .LBB40_3 +; NO_SVE-NEXT: .LBB40_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB40_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: ushll2 v2.2d, v2.4s, #1 +; NO_SVE-NEXT: tbnz w8, #1, .LBB40_13 +; NO_SVE-NEXT: // %bb.4: // %else2 +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB40_14 +; NO_SVE-NEXT: .LBB40_5: // %else5 +; NO_SVE-NEXT: ushll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #3, .LBB40_15 +; NO_SVE-NEXT: .LBB40_6: // %else8 +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB40_8 +; NO_SVE-NEXT: .LBB40_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: .LBB40_8: // %else11 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: ushll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #5, .LBB40_16 +; NO_SVE-NEXT: // %bb.9: // %else14 +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB40_17 +; NO_SVE-NEXT: .LBB40_10: // %else17 +; NO_SVE-NEXT: ushll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #7, .LBB40_18 +; NO_SVE-NEXT: .LBB40_11: // %else20 +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB40_19 +; NO_SVE-NEXT: .LBB40_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: b .LBB40_20 +; NO_SVE-NEXT: .LBB40_13: // %cond.load1 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB40_5 +; NO_SVE-NEXT: .LBB40_14: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: ushll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #3, .LBB40_6 +; NO_SVE-NEXT: .LBB40_15: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB40_7 +; NO_SVE-NEXT: b .LBB40_8 +; NO_SVE-NEXT: .LBB40_16: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB40_10 +; NO_SVE-NEXT: .LBB40_17: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: ushll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #7, .LBB40_11 +; NO_SVE-NEXT: .LBB40_18: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; 
NO_SVE-NEXT: tbnz w8, #8, .LBB40_12 +; NO_SVE-NEXT: .LBB40_19: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: .LBB40_20: // %else23 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: ushll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #9, .LBB40_30 +; NO_SVE-NEXT: // %bb.21: // %else26 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB40_31 +; NO_SVE-NEXT: .LBB40_22: // %else29 +; NO_SVE-NEXT: ushll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #11, .LBB40_32 +; NO_SVE-NEXT: .LBB40_23: // %else32 +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB40_25 +; NO_SVE-NEXT: .LBB40_24: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: .LBB40_25: // %else35 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: ushll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #13, .LBB40_33 +; NO_SVE-NEXT: // %bb.26: // %else38 +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB40_34 +; NO_SVE-NEXT: .LBB40_27: // %else41 +; NO_SVE-NEXT: ushll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #15, .LBB40_35 +; NO_SVE-NEXT: .LBB40_28: // %else44 +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB40_36 +; NO_SVE-NEXT: .LBB40_29: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: b .LBB40_37 +; NO_SVE-NEXT: .LBB40_30: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB40_22 +; NO_SVE-NEXT: .LBB40_31: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: ushll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #11, .LBB40_23 +; NO_SVE-NEXT: .LBB40_32: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB40_24 +; NO_SVE-NEXT: b .LBB40_25 +; NO_SVE-NEXT: .LBB40_33: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB40_27 +; NO_SVE-NEXT: .LBB40_34: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: ushll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #15, .LBB40_28 +; NO_SVE-NEXT: .LBB40_35: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB40_29 +; NO_SVE-NEXT: .LBB40_36: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB40_37: // %else47 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: ushll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #17, .LBB40_47 +; NO_SVE-NEXT: // %bb.38: // %else50 +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB40_48 +; NO_SVE-NEXT: .LBB40_39: // %else53 +; NO_SVE-NEXT: ushll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #19, .LBB40_49 +; NO_SVE-NEXT: .LBB40_40: // %else56 +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB40_42 +; NO_SVE-NEXT: .LBB40_41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: .LBB40_42: // %else59 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: ushll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #21, .LBB40_50 +; NO_SVE-NEXT: // %bb.43: // %else62 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB40_51 +; NO_SVE-NEXT: .LBB40_44: // %else65 +; NO_SVE-NEXT: ushll v5.2d, v6.2s, #1 +; 
NO_SVE-NEXT: tbnz w8, #23, .LBB40_52 +; NO_SVE-NEXT: .LBB40_45: // %else68 +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB40_53 +; NO_SVE-NEXT: .LBB40_46: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: b .LBB40_54 +; NO_SVE-NEXT: .LBB40_47: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB40_39 +; NO_SVE-NEXT: .LBB40_48: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: ushll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #19, .LBB40_40 +; NO_SVE-NEXT: .LBB40_49: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB40_41 +; NO_SVE-NEXT: b .LBB40_42 +; NO_SVE-NEXT: .LBB40_50: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB40_44 +; NO_SVE-NEXT: .LBB40_51: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: ushll v5.2d, v6.2s, #1 +; NO_SVE-NEXT: tbz w8, #23, .LBB40_45 +; NO_SVE-NEXT: .LBB40_52: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB40_46 +; NO_SVE-NEXT: .LBB40_53: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB40_54: // %else71 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: ushll2 v6.2d, v6.4s, #1 +; NO_SVE-NEXT: tbnz w8, #25, .LBB40_63 +; NO_SVE-NEXT: // %bb.55: // %else74 +; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB40_64 +; NO_SVE-NEXT: .LBB40_56: // %else77 +; NO_SVE-NEXT: ushll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #27, .LBB40_65 +; NO_SVE-NEXT: .LBB40_57: // %else80 +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB40_66 +; NO_SVE-NEXT: .LBB40_58: // %else83 +; NO_SVE-NEXT: ushll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #29, .LBB40_67 +; NO_SVE-NEXT: .LBB40_59: // %else86 +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB40_68 +; NO_SVE-NEXT: .LBB40_60: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB40_62 +; NO_SVE-NEXT: .LBB40_61: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB40_62: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB40_63: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB40_56 +; NO_SVE-NEXT: .LBB40_64: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: ushll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #27, .LBB40_57 +; NO_SVE-NEXT: .LBB40_65: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB40_58 +; NO_SVE-NEXT: .LBB40_66: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: ushll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbz w8, #29, .LBB40_59 +; NO_SVE-NEXT: .LBB40_67: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz 
w8, #30, .LBB40_60 +; NO_SVE-NEXT: .LBB40_68: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB40_61 +; NO_SVE-NEXT: b .LBB40_62 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_zext: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw #1] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw #1] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw #1] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw #1] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1046,18 +8642,401 @@ } define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_sext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_unscaled_sext: 
+; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; 
NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: saddw v3.2d, v1.2d, v2.2s +; NO_SVE-NEXT: tbz w8, #0, .LBB41_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB41_3 +; NO_SVE-NEXT: b .LBB41_4 +; NO_SVE-NEXT: .LBB41_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB41_4 +; NO_SVE-NEXT: .LBB41_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: saddw2 v2.2d, v1.2d, v2.4s +; NO_SVE-NEXT: tbnz w8, #2, .LBB41_13 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB41_14 +; NO_SVE-NEXT: .LBB41_6: // %else8 +; NO_SVE-NEXT: saddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #4, .LBB41_15 +; NO_SVE-NEXT: .LBB41_7: // %else11 +; NO_SVE-NEXT: tbz w8, #5, .LBB41_9 +; NO_SVE-NEXT: .LBB41_8: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: .LBB41_9: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: saddw2 v2.2d, v1.2d, v3.4s +; NO_SVE-NEXT: tbnz w8, #6, .LBB41_16 +; NO_SVE-NEXT: // %bb.10: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB41_17 +; NO_SVE-NEXT: .LBB41_11: // %else20 +; NO_SVE-NEXT: saddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #8, .LBB41_18 +; NO_SVE-NEXT: .LBB41_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB41_19 +; NO_SVE-NEXT: b .LBB41_20 +; NO_SVE-NEXT: .LBB41_13: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB41_6 +; NO_SVE-NEXT: .LBB41_14: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: saddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #4, .LBB41_7 +; NO_SVE-NEXT: .LBB41_15: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB41_8 +; NO_SVE-NEXT: b .LBB41_9 +; NO_SVE-NEXT: .LBB41_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB41_11 +; NO_SVE-NEXT: .LBB41_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: saddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #8, .LBB41_12 +; NO_SVE-NEXT: .LBB41_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB41_20 +; NO_SVE-NEXT: .LBB41_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: saddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #10, .LBB41_29 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB41_30 +; NO_SVE-NEXT: .LBB41_22: // %else32 +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #12, .LBB41_31 +; NO_SVE-NEXT: .LBB41_23: // %else35 +; NO_SVE-NEXT: tbz w8, #13, .LBB41_25 +; NO_SVE-NEXT: .LBB41_24: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: .LBB41_25: // %else38 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: saddw2 v3.2d, v1.2d, v3.4s +; NO_SVE-NEXT: 
tbnz w8, #14, .LBB41_32 +; NO_SVE-NEXT: // %bb.26: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB41_33 +; NO_SVE-NEXT: .LBB41_27: // %else44 +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #16, .LBB41_34 +; NO_SVE-NEXT: .LBB41_28: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB41_35 +; NO_SVE-NEXT: b .LBB41_36 +; NO_SVE-NEXT: .LBB41_29: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB41_22 +; NO_SVE-NEXT: .LBB41_30: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #12, .LBB41_23 +; NO_SVE-NEXT: .LBB41_31: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB41_24 +; NO_SVE-NEXT: b .LBB41_25 +; NO_SVE-NEXT: .LBB41_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB41_27 +; NO_SVE-NEXT: .LBB41_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #16, .LBB41_28 +; NO_SVE-NEXT: .LBB41_34: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #17, .LBB41_36 +; NO_SVE-NEXT: .LBB41_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_36: // %else50 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: saddw2 v5.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #18, .LBB41_45 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB41_46 +; NO_SVE-NEXT: .LBB41_38: // %else56 +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #20, .LBB41_47 +; NO_SVE-NEXT: .LBB41_39: // %else59 +; NO_SVE-NEXT: tbz w8, #21, .LBB41_41 +; NO_SVE-NEXT: .LBB41_40: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: .LBB41_41: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: saddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #22, .LBB41_48 +; NO_SVE-NEXT: // %bb.42: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB41_49 +; NO_SVE-NEXT: .LBB41_43: // %else68 +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbz w8, #24, .LBB41_50 +; NO_SVE-NEXT: .LBB41_44: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB41_51 +; NO_SVE-NEXT: b .LBB41_52 +; NO_SVE-NEXT: .LBB41_45: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB41_38 +; NO_SVE-NEXT: .LBB41_46: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #20, .LBB41_39 +; NO_SVE-NEXT: .LBB41_47: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB41_40 +; NO_SVE-NEXT: b .LBB41_41 +; NO_SVE-NEXT: .LBB41_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB41_43 +; NO_SVE-NEXT: .LBB41_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbnz w8, #24, .LBB41_44 +; NO_SVE-NEXT: .LBB41_50: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #25, .LBB41_52 +; NO_SVE-NEXT: .LBB41_51: // 
%cond.load73 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_52: // %else74 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: saddw2 v6.2d, v1.2d, v6.4s +; NO_SVE-NEXT: tbnz w8, #26, .LBB41_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB41_61 +; NO_SVE-NEXT: .LBB41_54: // %else80 +; NO_SVE-NEXT: saddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #28, .LBB41_62 +; NO_SVE-NEXT: .LBB41_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB41_63 +; NO_SVE-NEXT: .LBB41_56: // %else86 +; NO_SVE-NEXT: saddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #30, .LBB41_64 +; NO_SVE-NEXT: .LBB41_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB41_59 +; NO_SVE-NEXT: .LBB41_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB41_59: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB41_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB41_54 +; NO_SVE-NEXT: .LBB41_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: saddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #28, .LBB41_55 +; NO_SVE-NEXT: .LBB41_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB41_56 +; NO_SVE-NEXT: .LBB41_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: saddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbz w8, #30, .LBB41_57 +; NO_SVE-NEXT: .LBB41_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB41_58 +; NO_SVE-NEXT: b .LBB41_59 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h 
+; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1070,18 +9049,401 @@ } define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_unscaled_zext: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov 
w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: uaddw v3.2d, v1.2d, v2.2s +; NO_SVE-NEXT: tbz w8, #0, .LBB42_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB42_3 +; NO_SVE-NEXT: b .LBB42_4 +; NO_SVE-NEXT: .LBB42_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB42_4 +; NO_SVE-NEXT: .LBB42_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: uaddw2 v2.2d, v1.2d, v2.4s +; NO_SVE-NEXT: tbnz w8, #2, .LBB42_13 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB42_14 +; NO_SVE-NEXT: .LBB42_6: // %else8 +; NO_SVE-NEXT: uaddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #4, .LBB42_15 +; NO_SVE-NEXT: .LBB42_7: // %else11 +; NO_SVE-NEXT: tbz w8, #5, .LBB42_9 +; NO_SVE-NEXT: .LBB42_8: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: .LBB42_9: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: uaddw2 v2.2d, v1.2d, v3.4s +; NO_SVE-NEXT: tbnz w8, #6, .LBB42_16 +; NO_SVE-NEXT: // %bb.10: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB42_17 +; NO_SVE-NEXT: .LBB42_11: // %else20 +; NO_SVE-NEXT: uaddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #8, .LBB42_18 +; NO_SVE-NEXT: .LBB42_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB42_19 +; NO_SVE-NEXT: b .LBB42_20 +; NO_SVE-NEXT: .LBB42_13: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] 
+; NO_SVE-NEXT: tbz w8, #3, .LBB42_6 +; NO_SVE-NEXT: .LBB42_14: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: uaddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #4, .LBB42_7 +; NO_SVE-NEXT: .LBB42_15: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB42_8 +; NO_SVE-NEXT: b .LBB42_9 +; NO_SVE-NEXT: .LBB42_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB42_11 +; NO_SVE-NEXT: .LBB42_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: uaddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #8, .LBB42_12 +; NO_SVE-NEXT: .LBB42_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB42_20 +; NO_SVE-NEXT: .LBB42_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: uaddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #10, .LBB42_29 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB42_30 +; NO_SVE-NEXT: .LBB42_22: // %else32 +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #12, .LBB42_31 +; NO_SVE-NEXT: .LBB42_23: // %else35 +; NO_SVE-NEXT: tbz w8, #13, .LBB42_25 +; NO_SVE-NEXT: .LBB42_24: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: .LBB42_25: // %else38 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: uaddw2 v3.2d, v1.2d, v3.4s +; NO_SVE-NEXT: tbnz w8, #14, .LBB42_32 +; NO_SVE-NEXT: // %bb.26: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB42_33 +; NO_SVE-NEXT: .LBB42_27: // %else44 +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #16, .LBB42_34 +; NO_SVE-NEXT: .LBB42_28: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB42_35 +; NO_SVE-NEXT: b .LBB42_36 +; NO_SVE-NEXT: .LBB42_29: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB42_22 +; NO_SVE-NEXT: .LBB42_30: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #12, .LBB42_23 +; NO_SVE-NEXT: .LBB42_31: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB42_24 +; NO_SVE-NEXT: b .LBB42_25 +; NO_SVE-NEXT: .LBB42_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB42_27 +; NO_SVE-NEXT: .LBB42_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #16, .LBB42_28 +; NO_SVE-NEXT: .LBB42_34: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #17, .LBB42_36 +; NO_SVE-NEXT: .LBB42_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_36: // %else50 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: uaddw2 v5.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #18, .LBB42_45 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB42_46 +; NO_SVE-NEXT: .LBB42_38: // %else56 +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #20, .LBB42_47 +; NO_SVE-NEXT: .LBB42_39: // %else59 +; NO_SVE-NEXT: tbz w8, #21, .LBB42_41 +; NO_SVE-NEXT: 
.LBB42_40: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: .LBB42_41: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: uaddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #22, .LBB42_48 +; NO_SVE-NEXT: // %bb.42: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB42_49 +; NO_SVE-NEXT: .LBB42_43: // %else68 +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbz w8, #24, .LBB42_50 +; NO_SVE-NEXT: .LBB42_44: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB42_51 +; NO_SVE-NEXT: b .LBB42_52 +; NO_SVE-NEXT: .LBB42_45: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB42_38 +; NO_SVE-NEXT: .LBB42_46: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #20, .LBB42_39 +; NO_SVE-NEXT: .LBB42_47: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB42_40 +; NO_SVE-NEXT: b .LBB42_41 +; NO_SVE-NEXT: .LBB42_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB42_43 +; NO_SVE-NEXT: .LBB42_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbnz w8, #24, .LBB42_44 +; NO_SVE-NEXT: .LBB42_50: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #25, .LBB42_52 +; NO_SVE-NEXT: .LBB42_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_52: // %else74 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: uaddw2 v6.2d, v1.2d, v6.4s +; NO_SVE-NEXT: tbnz w8, #26, .LBB42_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB42_61 +; NO_SVE-NEXT: .LBB42_54: // %else80 +; NO_SVE-NEXT: uaddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #28, .LBB42_62 +; NO_SVE-NEXT: .LBB42_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB42_63 +; NO_SVE-NEXT: .LBB42_56: // %else86 +; NO_SVE-NEXT: uaddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #30, .LBB42_64 +; NO_SVE-NEXT: .LBB42_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB42_59 +; NO_SVE-NEXT: .LBB42_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB42_59: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB42_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB42_54 +; NO_SVE-NEXT: .LBB42_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: uaddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #28, .LBB42_55 +; NO_SVE-NEXT: .LBB42_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB42_56 +; NO_SVE-NEXT: .LBB42_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: uaddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbz w8, #30, .LBB42_57 +; NO_SVE-NEXT: .LBB42_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB42_58 +; NO_SVE-NEXT: b .LBB42_59 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_EQ_256: // 
%bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1094,6 +9456,452 @@ } define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 { +; NO_SVE-LABEL: masked_gather_64b_scaled: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] 
+; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: shl v0.2d, v0.2d, #2 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v3.2d, 
v2.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB43_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: shl v1.2d, v1.2d, #2 +; NO_SVE-NEXT: tbnz w8, #1, .LBB43_3 +; NO_SVE-NEXT: b .LBB43_4 +; NO_SVE-NEXT: .LBB43_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: shl v1.2d, v1.2d, #2 +; NO_SVE-NEXT: tbz w8, #1, .LBB43_4 +; NO_SVE-NEXT: .LBB43_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: add v1.2d, v2.2d, v1.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB43_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_6: // %else5 +; NO_SVE-NEXT: shl v4.2d, v3.2d, #2 +; NO_SVE-NEXT: tbz w8, #3, .LBB43_8 +; NO_SVE-NEXT: // %bb.7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB43_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: shl v3.2d, v3.2d, #2 +; NO_SVE-NEXT: tbnz w8, #5, .LBB43_11 +; NO_SVE-NEXT: b .LBB43_12 +; NO_SVE-NEXT: .LBB43_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: shl v3.2d, v3.2d, #2 +; NO_SVE-NEXT: tbz w8, #5, .LBB43_12 +; NO_SVE-NEXT: .LBB43_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB43_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_14: // %else17 +; NO_SVE-NEXT: shl v4.2d, v4.2d, #2 +; NO_SVE-NEXT: tbz w8, #7, .LBB43_16 +; NO_SVE-NEXT: // %bb.15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB43_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: shl v6.2d, v5.2d, #2 +; NO_SVE-NEXT: tbnz w8, #9, .LBB43_19 +; NO_SVE-NEXT: b .LBB43_20 +; NO_SVE-NEXT: .LBB43_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: shl v6.2d, v5.2d, #2 +; NO_SVE-NEXT: tbz w8, #9, .LBB43_20 +; NO_SVE-NEXT: .LBB43_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_20: // %else26 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: add v4.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB43_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_22: // %else29 +; NO_SVE-NEXT: shl v5.2d, v5.2d, #2 +; NO_SVE-NEXT: tbz w8, #11, .LBB43_24 +; NO_SVE-NEXT: // %bb.23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_24: // %else32 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB43_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: shl v7.2d, v6.2d, #2 +; NO_SVE-NEXT: tbnz w8, #13, .LBB43_27 +; NO_SVE-NEXT: b .LBB43_28 +; NO_SVE-NEXT: 
.LBB43_26: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: shl v7.2d, v6.2d, #2 +; NO_SVE-NEXT: tbz w8, #13, .LBB43_28 +; NO_SVE-NEXT: .LBB43_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_28: // %else38 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: add v5.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB43_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_30: // %else41 +; NO_SVE-NEXT: shl v6.2d, v6.2d, #2 +; NO_SVE-NEXT: tbz w8, #15, .LBB43_32 +; NO_SVE-NEXT: // %bb.31: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_32: // %else44 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB43_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: shl v16.2d, v7.2d, #2 +; NO_SVE-NEXT: tbnz w8, #17, .LBB43_35 +; NO_SVE-NEXT: b .LBB43_36 +; NO_SVE-NEXT: .LBB43_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: shl v16.2d, v7.2d, #2 +; NO_SVE-NEXT: tbz w8, #17, .LBB43_36 +; NO_SVE-NEXT: .LBB43_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_36: // %else50 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: add v6.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB43_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_38: // %else53 +; NO_SVE-NEXT: shl v7.2d, v7.2d, #2 +; NO_SVE-NEXT: tbz w8, #19, .LBB43_40 +; NO_SVE-NEXT: // %bb.39: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_40: // %else56 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB43_42 +; NO_SVE-NEXT: // %bb.41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: shl v17.2d, v16.2d, #2 +; NO_SVE-NEXT: tbnz w8, #21, .LBB43_43 +; NO_SVE-NEXT: b .LBB43_44 +; NO_SVE-NEXT: .LBB43_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: shl v17.2d, v16.2d, #2 +; NO_SVE-NEXT: tbz w8, #21, .LBB43_44 +; NO_SVE-NEXT: .LBB43_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_44: // %else62 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: add v7.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB43_46 +; NO_SVE-NEXT: // %bb.45: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_46: // %else65 +; NO_SVE-NEXT: shl v16.2d, v16.2d, #2 +; NO_SVE-NEXT: tbz w8, #23, .LBB43_48 +; NO_SVE-NEXT: // %bb.47: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_48: // %else68 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB43_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: shl v18.2d, v17.2d, #2 +; NO_SVE-NEXT: tbnz w8, #25, .LBB43_51 +; NO_SVE-NEXT: b .LBB43_52 +; NO_SVE-NEXT: .LBB43_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: shl v18.2d, v17.2d, #2 +; NO_SVE-NEXT: tbz w8, #25, .LBB43_52 +; NO_SVE-NEXT: .LBB43_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; 
NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_52: // %else74 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: add v16.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB43_54 +; NO_SVE-NEXT: // %bb.53: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_54: // %else77 +; NO_SVE-NEXT: shl v17.2d, v17.2d, #2 +; NO_SVE-NEXT: tbz w8, #27, .LBB43_56 +; NO_SVE-NEXT: // %bb.55: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_56: // %else80 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: add v17.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB43_58 +; NO_SVE-NEXT: // %bb.57: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: shl v18.2d, v18.2d, #2 +; NO_SVE-NEXT: tbnz w8, #29, .LBB43_59 +; NO_SVE-NEXT: b .LBB43_60 +; NO_SVE-NEXT: .LBB43_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: shl v18.2d, v18.2d, #2 +; NO_SVE-NEXT: tbz w8, #29, .LBB43_60 +; NO_SVE-NEXT: .LBB43_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB43_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB43_63 +; NO_SVE-NEXT: .LBB43_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB43_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB43_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB43_62 +; NO_SVE-NEXT: b .LBB43_63 +; +; VBITS_EQ_256-LABEL: masked_gather_64b_scaled: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d, lsl #2] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, 
z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d, lsl #2] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d, lsl #2] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d, lsl #2] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_scaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1116,6 +9924,464 @@ } define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 { +; NO_SVE-LABEL: masked_gather_64b_unscaled: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; 
NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v1.2d, v2.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB44_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #1, .LBB44_3 +; NO_SVE-NEXT: b .LBB44_4 +; NO_SVE-NEXT: .LBB44_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; 
NO_SVE-NEXT: tbz w8, #1, .LBB44_4 +; NO_SVE-NEXT: .LBB44_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_4: // %else2 +; NO_SVE-NEXT: add v1.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB44_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #3, .LBB44_9 +; NO_SVE-NEXT: .LBB44_6: // %else8 +; NO_SVE-NEXT: add v4.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB44_10 +; NO_SVE-NEXT: .LBB44_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #5, .LBB44_11 +; NO_SVE-NEXT: b .LBB44_12 +; NO_SVE-NEXT: .LBB44_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #3, .LBB44_6 +; NO_SVE-NEXT: .LBB44_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB44_7 +; NO_SVE-NEXT: .LBB44_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #5, .LBB44_12 +; NO_SVE-NEXT: .LBB44_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_12: // %else14 +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB44_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #7, .LBB44_17 +; NO_SVE-NEXT: .LBB44_14: // %else20 +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB44_18 +; NO_SVE-NEXT: .LBB44_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #9, .LBB44_19 +; NO_SVE-NEXT: b .LBB44_20 +; NO_SVE-NEXT: .LBB44_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbz w8, #7, .LBB44_14 +; NO_SVE-NEXT: .LBB44_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB44_15 +; NO_SVE-NEXT: .LBB44_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbz w8, #9, .LBB44_20 +; NO_SVE-NEXT: .LBB44_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_20: // %else26 +; NO_SVE-NEXT: add v4.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB44_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #11, .LBB44_25 +; NO_SVE-NEXT: .LBB44_22: // %else32 +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB44_26 +; NO_SVE-NEXT: .LBB44_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #13, .LBB44_27 +; NO_SVE-NEXT: b .LBB44_28 +; NO_SVE-NEXT: .LBB44_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbz w8, #11, .LBB44_22 +; NO_SVE-NEXT: .LBB44_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB44_23 +; NO_SVE-NEXT: .LBB44_26: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbz w8, #13, .LBB44_28 
+; NO_SVE-NEXT: .LBB44_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_28: // %else38 +; NO_SVE-NEXT: add v5.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB44_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #15, .LBB44_33 +; NO_SVE-NEXT: .LBB44_30: // %else44 +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB44_34 +; NO_SVE-NEXT: .LBB44_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #17, .LBB44_35 +; NO_SVE-NEXT: b .LBB44_36 +; NO_SVE-NEXT: .LBB44_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbz w8, #15, .LBB44_30 +; NO_SVE-NEXT: .LBB44_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB44_31 +; NO_SVE-NEXT: .LBB44_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbz w8, #17, .LBB44_36 +; NO_SVE-NEXT: .LBB44_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_36: // %else50 +; NO_SVE-NEXT: add v6.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB44_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #19, .LBB44_41 +; NO_SVE-NEXT: .LBB44_38: // %else56 +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB44_42 +; NO_SVE-NEXT: .LBB44_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #21, .LBB44_43 +; NO_SVE-NEXT: b .LBB44_44 +; NO_SVE-NEXT: .LBB44_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbz w8, #19, .LBB44_38 +; NO_SVE-NEXT: .LBB44_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB44_39 +; NO_SVE-NEXT: .LBB44_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #21, .LBB44_44 +; NO_SVE-NEXT: .LBB44_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_44: // %else62 +; NO_SVE-NEXT: add v7.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB44_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #23, .LBB44_49 +; NO_SVE-NEXT: .LBB44_46: // %else68 +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB44_50 +; NO_SVE-NEXT: .LBB44_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #25, .LBB44_51 +; NO_SVE-NEXT: b .LBB44_52 +; NO_SVE-NEXT: .LBB44_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #23, .LBB44_46 +; NO_SVE-NEXT: .LBB44_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB44_47 +; NO_SVE-NEXT: .LBB44_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbz 
w8, #25, .LBB44_52 +; NO_SVE-NEXT: .LBB44_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_52: // %else74 +; NO_SVE-NEXT: add v16.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB44_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #27, .LBB44_57 +; NO_SVE-NEXT: .LBB44_54: // %else80 +; NO_SVE-NEXT: add v17.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB44_58 +; NO_SVE-NEXT: .LBB44_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #29, .LBB44_59 +; NO_SVE-NEXT: b .LBB44_60 +; NO_SVE-NEXT: .LBB44_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbz w8, #27, .LBB44_54 +; NO_SVE-NEXT: .LBB44_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB44_55 +; NO_SVE-NEXT: .LBB44_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbz w8, #29, .LBB44_60 +; NO_SVE-NEXT: .LBB44_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB44_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB44_63 +; NO_SVE-NEXT: .LBB44_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB44_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB44_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB44_62 +; NO_SVE-NEXT: b .LBB44_63 +; +; VBITS_EQ_256-LABEL: masked_gather_64b_unscaled: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, 
p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_unscaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1139,6 +10405,464 @@ } define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 { +; NO_SVE-LABEL: masked_gather_vec_plus_reg: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, 
v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v1.2d, v0.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB45_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #1, .LBB45_3 +; NO_SVE-NEXT: 
b .LBB45_4 +; NO_SVE-NEXT: .LBB45_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbz w8, #1, .LBB45_4 +; NO_SVE-NEXT: .LBB45_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_4: // %else2 +; NO_SVE-NEXT: add v1.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB45_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #3, .LBB45_9 +; NO_SVE-NEXT: .LBB45_6: // %else8 +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB45_10 +; NO_SVE-NEXT: .LBB45_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #5, .LBB45_11 +; NO_SVE-NEXT: b .LBB45_12 +; NO_SVE-NEXT: .LBB45_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #3, .LBB45_6 +; NO_SVE-NEXT: .LBB45_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB45_7 +; NO_SVE-NEXT: .LBB45_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #5, .LBB45_12 +; NO_SVE-NEXT: .LBB45_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_12: // %else14 +; NO_SVE-NEXT: add v3.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB45_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #7, .LBB45_17 +; NO_SVE-NEXT: .LBB45_14: // %else20 +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB45_18 +; NO_SVE-NEXT: .LBB45_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #9, .LBB45_19 +; NO_SVE-NEXT: b .LBB45_20 +; NO_SVE-NEXT: .LBB45_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbz w8, #7, .LBB45_14 +; NO_SVE-NEXT: .LBB45_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB45_15 +; NO_SVE-NEXT: .LBB45_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbz w8, #9, .LBB45_20 +; NO_SVE-NEXT: .LBB45_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_20: // %else26 +; NO_SVE-NEXT: add v4.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB45_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #11, .LBB45_25 +; NO_SVE-NEXT: .LBB45_22: // %else32 +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB45_26 +; NO_SVE-NEXT: .LBB45_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #13, .LBB45_27 +; NO_SVE-NEXT: b .LBB45_28 +; NO_SVE-NEXT: .LBB45_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbz w8, #11, .LBB45_22 +; NO_SVE-NEXT: .LBB45_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB45_23 +; NO_SVE-NEXT: .LBB45_26: 
+; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbz w8, #13, .LBB45_28 +; NO_SVE-NEXT: .LBB45_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_28: // %else38 +; NO_SVE-NEXT: add v5.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB45_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #15, .LBB45_33 +; NO_SVE-NEXT: .LBB45_30: // %else44 +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB45_34 +; NO_SVE-NEXT: .LBB45_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #17, .LBB45_35 +; NO_SVE-NEXT: b .LBB45_36 +; NO_SVE-NEXT: .LBB45_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbz w8, #15, .LBB45_30 +; NO_SVE-NEXT: .LBB45_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB45_31 +; NO_SVE-NEXT: .LBB45_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbz w8, #17, .LBB45_36 +; NO_SVE-NEXT: .LBB45_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_36: // %else50 +; NO_SVE-NEXT: add v6.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB45_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #19, .LBB45_41 +; NO_SVE-NEXT: .LBB45_38: // %else56 +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB45_42 +; NO_SVE-NEXT: .LBB45_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #21, .LBB45_43 +; NO_SVE-NEXT: b .LBB45_44 +; NO_SVE-NEXT: .LBB45_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbz w8, #19, .LBB45_38 +; NO_SVE-NEXT: .LBB45_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB45_39 +; NO_SVE-NEXT: .LBB45_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #21, .LBB45_44 +; NO_SVE-NEXT: .LBB45_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_44: // %else62 +; NO_SVE-NEXT: add v7.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB45_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #23, .LBB45_49 +; NO_SVE-NEXT: .LBB45_46: // %else68 +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB45_50 +; NO_SVE-NEXT: .LBB45_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #25, .LBB45_51 +; NO_SVE-NEXT: b .LBB45_52 +; NO_SVE-NEXT: .LBB45_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #23, .LBB45_46 +; NO_SVE-NEXT: .LBB45_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB45_47 +; 
NO_SVE-NEXT: .LBB45_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbz w8, #25, .LBB45_52 +; NO_SVE-NEXT: .LBB45_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_52: // %else74 +; NO_SVE-NEXT: add v16.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB45_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #27, .LBB45_57 +; NO_SVE-NEXT: .LBB45_54: // %else80 +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB45_58 +; NO_SVE-NEXT: .LBB45_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #29, .LBB45_59 +; NO_SVE-NEXT: b .LBB45_60 +; NO_SVE-NEXT: .LBB45_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbz w8, #27, .LBB45_54 +; NO_SVE-NEXT: .LBB45_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB45_55 +; NO_SVE-NEXT: .LBB45_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbz w8, #29, .LBB45_60 +; NO_SVE-NEXT: .LBB45_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v18.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB45_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB45_63 +; NO_SVE-NEXT: .LBB45_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB45_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB45_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB45_62 +; NO_SVE-NEXT: b .LBB45_63 +; +; VBITS_EQ_256-LABEL: masked_gather_vec_plus_reg: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, 
p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1162,6 +10886,464 @@ } define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_vec_plus_imm: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q3, [x0, #64] +; NO_SVE-NEXT: mov w9, #4 +; NO_SVE-NEXT: fcmeq v6.4s, v0.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v6.8h, v3.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldp q0, q7, [x0, #32] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: umov w14, v2.b[5] +; 
NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v4.8h +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v3.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[1] +; NO_SVE-NEXT: orr w8, w10, w8, lsl #8 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #9 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: fcmeq v6.4s, v7.4s, #0.0 +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #10 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #11 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: orr w8, w8, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v6.8h +; NO_SVE-NEXT: bfi w10, w13, #1, #1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[6] +; NO_SVE-NEXT: bfi w10, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w15, lsl #13 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #7 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v3.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w8, w12, lsl #15 +; NO_SVE-NEXT: orr w8, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #15 +; NO_SVE-NEXT: dup v2.2d, x9 +; NO_SVE-NEXT: bfi w8, w12, #16, #16 +; NO_SVE-NEXT: add v1.2d, v0.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB46_2 +; NO_SVE-NEXT: // %bb.1: // 
%cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #1, .LBB46_3 +; NO_SVE-NEXT: b .LBB46_4 +; NO_SVE-NEXT: .LBB46_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbz w8, #1, .LBB46_4 +; NO_SVE-NEXT: .LBB46_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_4: // %else2 +; NO_SVE-NEXT: add v1.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB46_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #3, .LBB46_9 +; NO_SVE-NEXT: .LBB46_6: // %else8 +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB46_10 +; NO_SVE-NEXT: .LBB46_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #5, .LBB46_11 +; NO_SVE-NEXT: b .LBB46_12 +; NO_SVE-NEXT: .LBB46_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #3, .LBB46_6 +; NO_SVE-NEXT: .LBB46_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB46_7 +; NO_SVE-NEXT: .LBB46_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #5, .LBB46_12 +; NO_SVE-NEXT: .LBB46_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_12: // %else14 +; NO_SVE-NEXT: add v3.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB46_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #7, .LBB46_17 +; NO_SVE-NEXT: .LBB46_14: // %else20 +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB46_18 +; NO_SVE-NEXT: .LBB46_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #9, .LBB46_19 +; NO_SVE-NEXT: b .LBB46_20 +; NO_SVE-NEXT: .LBB46_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbz w8, #7, .LBB46_14 +; NO_SVE-NEXT: .LBB46_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB46_15 +; NO_SVE-NEXT: .LBB46_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbz w8, #9, .LBB46_20 +; NO_SVE-NEXT: .LBB46_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_20: // %else26 +; NO_SVE-NEXT: add v4.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB46_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #11, .LBB46_25 +; NO_SVE-NEXT: .LBB46_22: // %else32 +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB46_26 +; NO_SVE-NEXT: .LBB46_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #13, .LBB46_27 +; NO_SVE-NEXT: b .LBB46_28 +; NO_SVE-NEXT: .LBB46_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbz w8, #11, .LBB46_22 +; NO_SVE-NEXT: .LBB46_25: // %cond.load31 +; NO_SVE-NEXT: mov 
x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB46_23 +; NO_SVE-NEXT: .LBB46_26: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbz w8, #13, .LBB46_28 +; NO_SVE-NEXT: .LBB46_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_28: // %else38 +; NO_SVE-NEXT: add v5.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB46_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #15, .LBB46_33 +; NO_SVE-NEXT: .LBB46_30: // %else44 +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB46_34 +; NO_SVE-NEXT: .LBB46_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #17, .LBB46_35 +; NO_SVE-NEXT: b .LBB46_36 +; NO_SVE-NEXT: .LBB46_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbz w8, #15, .LBB46_30 +; NO_SVE-NEXT: .LBB46_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB46_31 +; NO_SVE-NEXT: .LBB46_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbz w8, #17, .LBB46_36 +; NO_SVE-NEXT: .LBB46_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_36: // %else50 +; NO_SVE-NEXT: add v6.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB46_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #19, .LBB46_41 +; NO_SVE-NEXT: .LBB46_38: // %else56 +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB46_42 +; NO_SVE-NEXT: .LBB46_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #21, .LBB46_43 +; NO_SVE-NEXT: b .LBB46_44 +; NO_SVE-NEXT: .LBB46_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbz w8, #19, .LBB46_38 +; NO_SVE-NEXT: .LBB46_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB46_39 +; NO_SVE-NEXT: .LBB46_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #21, .LBB46_44 +; NO_SVE-NEXT: .LBB46_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_44: // %else62 +; NO_SVE-NEXT: add v7.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB46_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #23, .LBB46_49 +; NO_SVE-NEXT: .LBB46_46: // %else68 +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB46_50 +; NO_SVE-NEXT: .LBB46_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #25, .LBB46_51 +; NO_SVE-NEXT: b .LBB46_52 +; NO_SVE-NEXT: .LBB46_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #23, .LBB46_46 +; NO_SVE-NEXT: .LBB46_49: // %cond.load67 +; 
NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB46_47 +; NO_SVE-NEXT: .LBB46_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbz w8, #25, .LBB46_52 +; NO_SVE-NEXT: .LBB46_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_52: // %else74 +; NO_SVE-NEXT: add v16.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB46_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #27, .LBB46_57 +; NO_SVE-NEXT: .LBB46_54: // %else80 +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB46_58 +; NO_SVE-NEXT: .LBB46_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #29, .LBB46_59 +; NO_SVE-NEXT: b .LBB46_60 +; NO_SVE-NEXT: .LBB46_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbz w8, #27, .LBB46_54 +; NO_SVE-NEXT: .LBB46_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB46_55 +; NO_SVE-NEXT: .LBB46_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbz w8, #29, .LBB46_60 +; NO_SVE-NEXT: .LBB46_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v18.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB46_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB46_63 +; NO_SVE-NEXT: .LBB46_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB46_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB46_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB46_62 +; NO_SVE-NEXT: b .LBB46_63 +; +; VBITS_EQ_256-LABEL: masked_gather_vec_plus_imm: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; 
VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d, #4] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d, #4] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d, #4] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d, #4] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1185,6 +11367,449 @@ } define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) #0 { +; NO_SVE-LABEL: masked_gather_passthru: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 
v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: ldr q0, [x2] +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi 
w8, w11, #16, #16 +; NO_SVE-NEXT: tbnz w8, #0, .LBB47_41 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB47_42 +; NO_SVE-NEXT: .LBB47_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB47_43 +; NO_SVE-NEXT: .LBB47_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB47_5 +; NO_SVE-NEXT: .LBB47_4: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_5: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: ldr q1, [x2, #16] +; NO_SVE-NEXT: tbnz w8, #4, .LBB47_44 +; NO_SVE-NEXT: // %bb.6: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB47_45 +; NO_SVE-NEXT: .LBB47_7: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB47_46 +; NO_SVE-NEXT: .LBB47_8: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB47_10 +; NO_SVE-NEXT: .LBB47_9: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_10: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: ldr q2, [x2, #32] +; NO_SVE-NEXT: tbnz w8, #8, .LBB47_47 +; NO_SVE-NEXT: // %bb.11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB47_48 +; NO_SVE-NEXT: .LBB47_12: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB47_49 +; NO_SVE-NEXT: .LBB47_13: // %else29 +; NO_SVE-NEXT: tbz w8, #11, .LBB47_15 +; NO_SVE-NEXT: .LBB47_14: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_15: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: ldr q3, [x2, #48] +; NO_SVE-NEXT: tbnz w8, #12, .LBB47_50 +; NO_SVE-NEXT: // %bb.16: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB47_51 +; NO_SVE-NEXT: .LBB47_17: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB47_52 +; NO_SVE-NEXT: .LBB47_18: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB47_20 +; NO_SVE-NEXT: .LBB47_19: // %cond.load43 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_20: // %else44 +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: ldr q4, [x2, #64] +; NO_SVE-NEXT: tbnz w8, #16, .LBB47_53 +; NO_SVE-NEXT: // %bb.21: // %else47 +; NO_SVE-NEXT: tbnz w8, #17, .LBB47_54 +; NO_SVE-NEXT: .LBB47_22: // %else50 +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB47_55 +; NO_SVE-NEXT: .LBB47_23: // %else53 +; NO_SVE-NEXT: tbz w8, #19, .LBB47_25 +; NO_SVE-NEXT: .LBB47_24: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_25: // %else56 +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: ldr q5, [x2, #80] +; NO_SVE-NEXT: tbnz w8, #20, .LBB47_56 +; NO_SVE-NEXT: // %bb.26: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB47_57 +; NO_SVE-NEXT: .LBB47_27: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB47_58 +; NO_SVE-NEXT: .LBB47_28: // %else65 +; NO_SVE-NEXT: tbz w8, #23, .LBB47_30 +; NO_SVE-NEXT: .LBB47_29: // %cond.load67 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_30: // %else68 +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: ldr q6, [x2, #96] +; NO_SVE-NEXT: tbnz w8, #24, .LBB47_59 +; NO_SVE-NEXT: // %bb.31: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB47_60 +; NO_SVE-NEXT: .LBB47_32: // %else74 +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB47_61 +; NO_SVE-NEXT: .LBB47_33: // %else77 +; NO_SVE-NEXT: tbz w8, #27, .LBB47_35 +; NO_SVE-NEXT: .LBB47_34: // %cond.load79 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: 
ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_35: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: ldr q7, [x2, #112] +; NO_SVE-NEXT: tbnz w8, #28, .LBB47_62 +; NO_SVE-NEXT: // %bb.36: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB47_63 +; NO_SVE-NEXT: .LBB47_37: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB47_64 +; NO_SVE-NEXT: .LBB47_38: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB47_40 +; NO_SVE-NEXT: .LBB47_39: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB47_40: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB47_41: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB47_2 +; NO_SVE-NEXT: .LBB47_42: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB47_3 +; NO_SVE-NEXT: .LBB47_43: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB47_4 +; NO_SVE-NEXT: b .LBB47_5 +; NO_SVE-NEXT: .LBB47_44: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB47_7 +; NO_SVE-NEXT: .LBB47_45: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB47_8 +; NO_SVE-NEXT: .LBB47_46: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB47_9 +; NO_SVE-NEXT: b .LBB47_10 +; NO_SVE-NEXT: .LBB47_47: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB47_12 +; NO_SVE-NEXT: .LBB47_48: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB47_13 +; NO_SVE-NEXT: .LBB47_49: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB47_14 +; NO_SVE-NEXT: b .LBB47_15 +; NO_SVE-NEXT: .LBB47_50: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB47_17 +; NO_SVE-NEXT: .LBB47_51: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB47_18 +; NO_SVE-NEXT: .LBB47_52: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB47_19 +; NO_SVE-NEXT: b .LBB47_20 +; NO_SVE-NEXT: .LBB47_53: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #17, .LBB47_22 +; NO_SVE-NEXT: .LBB47_54: // %cond.load49 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB47_23 +; NO_SVE-NEXT: .LBB47_55: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #19, .LBB47_24 +; NO_SVE-NEXT: b .LBB47_25 +; NO_SVE-NEXT: .LBB47_56: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB47_27 +; NO_SVE-NEXT: .LBB47_57: // %cond.load61 +; NO_SVE-NEXT: mov x9, v6.d[1] 
+; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB47_28 +; NO_SVE-NEXT: .LBB47_58: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #23, .LBB47_29 +; NO_SVE-NEXT: b .LBB47_30 +; NO_SVE-NEXT: .LBB47_59: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB47_32 +; NO_SVE-NEXT: .LBB47_60: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB47_33 +; NO_SVE-NEXT: .LBB47_61: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #27, .LBB47_34 +; NO_SVE-NEXT: b .LBB47_35 +; NO_SVE-NEXT: .LBB47_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB47_37 +; NO_SVE-NEXT: .LBB47_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB47_38 +; NO_SVE-NEXT: .LBB47_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB47_39 +; NO_SVE-NEXT: b .LBB47_40 +; +; VBITS_EQ_256-LABEL: masked_gather_passthru: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z20.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z4.s, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: punpklo p3.h, p2.b +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x2, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x2, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z16.s }, p0/z, [x2, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x2] +; VBITS_EQ_256-NEXT: ld1w { z4.d }, p3/z, [z23.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z17.s, #0.0 +; VBITS_EQ_256-NEXT: mov z17.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: bif v4.16b, v16.16b, v17.16b +; VBITS_EQ_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z23.d, z17.s +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z22.d }, p4/z, [z22.d] +; VBITS_EQ_256-NEXT: ld1w { z21.d }, p2/z, [z21.d] +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z20.s, #0.0 +; VBITS_EQ_256-NEXT: mov z20.s, p3/z, #-1 // 
=0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p2.b +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: uzp1 z21.s, z21.s, z21.s +; VBITS_EQ_256-NEXT: uzp1 z22.s, z22.s, z22.s +; VBITS_EQ_256-NEXT: bif v21.16b, v5.16b, v20.16b +; VBITS_EQ_256-NEXT: ext z20.b, z20.b, z20.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z23.d, z20.s +; VBITS_EQ_256-NEXT: ext z5.b, z5.b, z5.b, #16 +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z19.d }, p4/z, [z19.d] +; VBITS_EQ_256-NEXT: ld1w { z18.d }, p3/z, [z18.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z7.s, #0.0 +; VBITS_EQ_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: uzp1 z18.s, z18.s, z18.s +; VBITS_EQ_256-NEXT: bif v18.16b, v1.16b, v7.16b +; VBITS_EQ_256-NEXT: ext z7.b, z7.b, z7.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z23.d, z7.s +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_EQ_256-NEXT: mov z23.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p4/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: bit v16.16b, v22.16b, v17.16b +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: bif v3.16b, v6.16b, v23.16b +; VBITS_EQ_256-NEXT: ext z23.b, z23.b, z23.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z17.d, z23.s +; VBITS_EQ_256-NEXT: ext z6.b, z6.b, z6.b, #16 +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z17.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z17.s, z19.s, z19.s +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: bit v5.16b, v17.16b, v20.16b +; VBITS_EQ_256-NEXT: splice z4.s, p1, z4.s, z16.s +; VBITS_EQ_256-NEXT: bit v1.16b, v2.16b, v7.16b +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: bif v0.16b, v6.16b, v23.16b +; VBITS_EQ_256-NEXT: splice z21.s, p1, z21.s, z5.s +; VBITS_EQ_256-NEXT: splice z18.s, p1, z18.s, z1.s +; VBITS_EQ_256-NEXT: st1w { z21.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z18.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1209,6 +11834,447 @@ } define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_passthru_0: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; 
NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v4.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v4.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v4.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v4.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v2.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v1.4s, v3.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v4.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: ldr q3, [x1] +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB48_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; 
NO_SVE-NEXT: ld1 { v0.s }[0], [x9] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #1, .LBB48_3 +; NO_SVE-NEXT: b .LBB48_4 +; NO_SVE-NEXT: .LBB48_2: +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w8, #1, .LBB48_4 +; NO_SVE-NEXT: .LBB48_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB48_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB48_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB48_13 +; NO_SVE-NEXT: .LBB48_6: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB48_14 +; NO_SVE-NEXT: .LBB48_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB48_15 +; NO_SVE-NEXT: .LBB48_8: // %else14 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB48_16 +; NO_SVE-NEXT: .LBB48_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB48_17 +; NO_SVE-NEXT: .LBB48_10: // %else20 +; NO_SVE-NEXT: ldr q16, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB48_18 +; NO_SVE-NEXT: .LBB48_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: mov v17.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v17.s }[0], [x9] +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v2.16b, v17.16b +; NO_SVE-NEXT: tbnz w8, #9, .LBB48_19 +; NO_SVE-NEXT: b .LBB48_20 +; NO_SVE-NEXT: .LBB48_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB48_6 +; NO_SVE-NEXT: .LBB48_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB48_7 +; NO_SVE-NEXT: .LBB48_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB48_8 +; NO_SVE-NEXT: .LBB48_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB48_9 +; NO_SVE-NEXT: .LBB48_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB48_10 +; NO_SVE-NEXT: .LBB48_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB48_11 +; NO_SVE-NEXT: .LBB48_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: tbz w8, #9, .LBB48_20 +; NO_SVE-NEXT: .LBB48_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB48_20: // %else26 +; NO_SVE-NEXT: ldr q16, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB48_44 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB48_45 +; NO_SVE-NEXT: .LBB48_22: // %else32 +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB48_46 +; NO_SVE-NEXT: .LBB48_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB48_47 +; NO_SVE-NEXT: .LBB48_24: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB48_48 +; NO_SVE-NEXT: .LBB48_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB48_49 +; NO_SVE-NEXT: .LBB48_26: // %else44 +; NO_SVE-NEXT: ldr q16, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB48_50 +; NO_SVE-NEXT: 
.LBB48_27: // %else47 +; NO_SVE-NEXT: tbnz w8, #17, .LBB48_51 +; NO_SVE-NEXT: .LBB48_28: // %else50 +; NO_SVE-NEXT: ldr q16, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB48_52 +; NO_SVE-NEXT: .LBB48_29: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB48_53 +; NO_SVE-NEXT: .LBB48_30: // %else56 +; NO_SVE-NEXT: ldr q16, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB48_54 +; NO_SVE-NEXT: .LBB48_31: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB48_55 +; NO_SVE-NEXT: .LBB48_32: // %else62 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB48_56 +; NO_SVE-NEXT: .LBB48_33: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB48_57 +; NO_SVE-NEXT: .LBB48_34: // %else68 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB48_58 +; NO_SVE-NEXT: .LBB48_35: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB48_59 +; NO_SVE-NEXT: .LBB48_36: // %else74 +; NO_SVE-NEXT: ldr q16, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB48_60 +; NO_SVE-NEXT: .LBB48_37: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB48_61 +; NO_SVE-NEXT: .LBB48_38: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB48_62 +; NO_SVE-NEXT: .LBB48_39: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB48_63 +; NO_SVE-NEXT: .LBB48_40: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB48_64 +; NO_SVE-NEXT: .LBB48_41: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB48_43 +; NO_SVE-NEXT: .LBB48_42: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB48_43: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB48_44: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB48_22 +; NO_SVE-NEXT: .LBB48_45: // %cond.load31 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB48_23 +; NO_SVE-NEXT: .LBB48_46: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB48_24 +; NO_SVE-NEXT: .LBB48_47: // %cond.load37 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB48_25 +; NO_SVE-NEXT: .LBB48_48: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB48_26 +; NO_SVE-NEXT: .LBB48_49: // %cond.load43 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB48_27 +; NO_SVE-NEXT: .LBB48_50: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #17, .LBB48_28 +; NO_SVE-NEXT: .LBB48_51: // %cond.load49 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB48_29 +; NO_SVE-NEXT: .LBB48_52: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB48_30 +; NO_SVE-NEXT: .LBB48_53: // %cond.load55 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB48_31 +; NO_SVE-NEXT: .LBB48_54: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] 
+; NO_SVE-NEXT: tbz w8, #21, .LBB48_32 +; NO_SVE-NEXT: .LBB48_55: // %cond.load61 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB48_33 +; NO_SVE-NEXT: .LBB48_56: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB48_34 +; NO_SVE-NEXT: .LBB48_57: // %cond.load67 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB48_35 +; NO_SVE-NEXT: .LBB48_58: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB48_36 +; NO_SVE-NEXT: .LBB48_59: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB48_37 +; NO_SVE-NEXT: .LBB48_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB48_38 +; NO_SVE-NEXT: .LBB48_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB48_39 +; NO_SVE-NEXT: .LBB48_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB48_40 +; NO_SVE-NEXT: .LBB48_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB48_41 +; NO_SVE-NEXT: .LBB48_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB48_42 +; NO_SVE-NEXT: b .LBB48_43 +; +; VBITS_EQ_256-LABEL: masked_gather_passthru_0: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext 
z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru_0: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK @@ -24,6 +25,34 @@ ; Masked Loads ; define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr s1, [x0] +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ldr s2, [x1] +; NO_SVE-NEXT: fcmeq v1.4h, v1.4h, v2.4h +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: umov w9, v1.h[0] +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB0_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.h }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB0_4 +; NO_SVE-NEXT: b .LBB0_3 +; NO_SVE-NEXT: .LBB0_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB0_4 +; NO_SVE-NEXT: .LBB0_3: // 
%cond.load1 +; NO_SVE-NEXT: add x8, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x8] +; NO_SVE-NEXT: .LBB0_4: // %else2 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] @@ -55,6 +84,34 @@ } define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d1, [x0] +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ldr d2, [x1] +; NO_SVE-NEXT: fcmeq v1.2s, v1.2s, v2.2s +; NO_SVE-NEXT: mov w8, v1.s[1] +; NO_SVE-NEXT: fmov w9, s1 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB1_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB1_4 +; NO_SVE-NEXT: b .LBB1_3 +; NO_SVE-NEXT: .LBB1_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB1_4 +; NO_SVE-NEXT: .LBB1_3: // %cond.load1 +; NO_SVE-NEXT: add x8, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x8] +; NO_SVE-NEXT: .LBB1_4: // %else2 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -73,6 +130,53 @@ } define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q1, [x0] +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v2.4s +; NO_SVE-NEXT: xtn v1.4h, v1.4s +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: umov w9, v1.h[2] +; NO_SVE-NEXT: umov w10, v1.h[0] +; NO_SVE-NEXT: umov w11, v1.h[3] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB2_5 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB2_6 +; NO_SVE-NEXT: .LBB2_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB2_7 +; NO_SVE-NEXT: .LBB2_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB2_8 +; NO_SVE-NEXT: .LBB2_4: // %else8 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB2_5: // %cond.load +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB2_2 +; NO_SVE-NEXT: .LBB2_6: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB2_3 +; NO_SVE-NEXT: .LBB2_7: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB2_4 +; NO_SVE-NEXT: .LBB2_8: // %cond.load7 +; NO_SVE-NEXT: add x8, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -91,6 +195,94 @@ } define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: ldp q4, q3, 
[x1] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v4.4s +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w9, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: and w9, w13, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: bfi w10, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #5, #1 +; NO_SVE-NEXT: orr w8, w10, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB3_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB3_10 +; NO_SVE-NEXT: .LBB3_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB3_11 +; NO_SVE-NEXT: .LBB3_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB3_12 +; NO_SVE-NEXT: .LBB3_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB3_13 +; NO_SVE-NEXT: .LBB3_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB3_14 +; NO_SVE-NEXT: .LBB3_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB3_15 +; NO_SVE-NEXT: .LBB3_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB3_16 +; NO_SVE-NEXT: .LBB3_8: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB3_9: // %cond.load +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB3_2 +; NO_SVE-NEXT: .LBB3_10: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB3_3 +; NO_SVE-NEXT: .LBB3_11: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB3_4 +; NO_SVE-NEXT: .LBB3_12: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB3_5 +; NO_SVE-NEXT: .LBB3_13: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB3_6 +; NO_SVE-NEXT: .LBB3_14: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB3_7 +; NO_SVE-NEXT: .LBB3_15: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB3_8 +; NO_SVE-NEXT: .LBB3_16: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -108,6 +300,176 @@ } define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v3.4s +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v2.4s +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, 
v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v4.4s +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB4_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #1, .LBB4_11 +; NO_SVE-NEXT: .LBB4_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB4_12 +; NO_SVE-NEXT: .LBB4_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB4_13 +; NO_SVE-NEXT: .LBB4_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB4_14 +; NO_SVE-NEXT: .LBB4_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB4_15 +; NO_SVE-NEXT: .LBB4_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB4_16 +; NO_SVE-NEXT: .LBB4_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB4_17 +; NO_SVE-NEXT: .LBB4_8: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB4_18 +; NO_SVE-NEXT: .LBB4_9: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: mov v2.16b, v4.16b +; NO_SVE-NEXT: tbnz w8, #9, .LBB4_19 +; NO_SVE-NEXT: b .LBB4_20 +; NO_SVE-NEXT: .LBB4_10: // %cond.load +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w8, #1, .LBB4_2 +; NO_SVE-NEXT: .LBB4_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB4_3 +; NO_SVE-NEXT: .LBB4_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB4_4 +; NO_SVE-NEXT: .LBB4_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB4_5 +; NO_SVE-NEXT: .LBB4_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s 
}[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB4_6 +; NO_SVE-NEXT: .LBB4_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB4_7 +; NO_SVE-NEXT: .LBB4_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB4_8 +; NO_SVE-NEXT: .LBB4_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB4_9 +; NO_SVE-NEXT: .LBB4_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: tbz w8, #9, .LBB4_20 +; NO_SVE-NEXT: .LBB4_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB4_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB4_27 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB4_28 +; NO_SVE-NEXT: .LBB4_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB4_29 +; NO_SVE-NEXT: .LBB4_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB4_30 +; NO_SVE-NEXT: .LBB4_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB4_31 +; NO_SVE-NEXT: .LBB4_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB4_32 +; NO_SVE-NEXT: .LBB4_26: // %else44 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB4_27: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB4_22 +; NO_SVE-NEXT: .LBB4_28: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB4_23 +; NO_SVE-NEXT: .LBB4_29: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB4_24 +; NO_SVE-NEXT: .LBB4_30: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB4_25 +; NO_SVE-NEXT: .LBB4_31: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB4_26 +; NO_SVE-NEXT: .LBB4_32: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -125,6 +487,338 @@ } define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: ldp q2, q3, [x1, #64] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v2.4s +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v3.4s +; NO_SVE-NEXT: ldp q3, q2, [x0, #96] +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: ldp q6, q7, [x1, #96] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v6.4s +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v7.4s +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: ldp q16, q1, [x1] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #2, #1 
+; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[1] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v16.4s +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: fcmeq v1.4s, v5.4s, v1.4s +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: ldp q3, q5, [x0, #32] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v4.8h, v1.8h +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #7 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #8 +; NO_SVE-NEXT: ldp q1, q4, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w8, w8, w11, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #10 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: fcmeq v1.4s, v3.4s, v1.4s +; NO_SVE-NEXT: orr w8, w8, w13, lsl #11 +; NO_SVE-NEXT: fcmeq v4.4s, v5.4s, v4.4s +; NO_SVE-NEXT: orr w8, w8, w14, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v4.8h +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w9, v2.b[5] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v2.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #0, .LBB5_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #1, .LBB5_11 +; NO_SVE-NEXT: .LBB5_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB5_12 +; 
NO_SVE-NEXT: .LBB5_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB5_13 +; NO_SVE-NEXT: .LBB5_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB5_14 +; NO_SVE-NEXT: .LBB5_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB5_15 +; NO_SVE-NEXT: .LBB5_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB5_16 +; NO_SVE-NEXT: .LBB5_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB5_17 +; NO_SVE-NEXT: .LBB5_8: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB5_18 +; NO_SVE-NEXT: .LBB5_9: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: mov v16.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v2.16b, v16.16b +; NO_SVE-NEXT: tbnz w8, #9, .LBB5_19 +; NO_SVE-NEXT: b .LBB5_20 +; NO_SVE-NEXT: .LBB5_10: // %cond.load +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w8, #1, .LBB5_2 +; NO_SVE-NEXT: .LBB5_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB5_3 +; NO_SVE-NEXT: .LBB5_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB5_4 +; NO_SVE-NEXT: .LBB5_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB5_5 +; NO_SVE-NEXT: .LBB5_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB5_6 +; NO_SVE-NEXT: .LBB5_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB5_7 +; NO_SVE-NEXT: .LBB5_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB5_8 +; NO_SVE-NEXT: .LBB5_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB5_9 +; NO_SVE-NEXT: .LBB5_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: tbz w8, #9, .LBB5_20 +; NO_SVE-NEXT: .LBB5_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB5_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB5_43 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB5_44 +; NO_SVE-NEXT: .LBB5_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB5_45 +; NO_SVE-NEXT: .LBB5_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB5_46 +; NO_SVE-NEXT: .LBB5_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB5_47 +; NO_SVE-NEXT: .LBB5_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB5_48 +; NO_SVE-NEXT: .LBB5_26: // %else44 +; NO_SVE-NEXT: tbnz w8, #16, .LBB5_49 +; NO_SVE-NEXT: .LBB5_27: // %else47 +; NO_SVE-NEXT: tbnz w8, #17, .LBB5_50 +; NO_SVE-NEXT: .LBB5_28: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB5_51 +; NO_SVE-NEXT: .LBB5_29: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB5_52 +; NO_SVE-NEXT: .LBB5_30: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB5_53 +; NO_SVE-NEXT: .LBB5_31: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB5_54 +; NO_SVE-NEXT: .LBB5_32: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB5_55 +; NO_SVE-NEXT: .LBB5_33: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB5_56 +; NO_SVE-NEXT: .LBB5_34: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB5_57 +; 
NO_SVE-NEXT: .LBB5_35: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB5_58 +; NO_SVE-NEXT: .LBB5_36: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB5_59 +; NO_SVE-NEXT: .LBB5_37: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB5_60 +; NO_SVE-NEXT: .LBB5_38: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB5_61 +; NO_SVE-NEXT: .LBB5_39: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB5_62 +; NO_SVE-NEXT: .LBB5_40: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB5_63 +; NO_SVE-NEXT: .LBB5_41: // %else89 +; NO_SVE-NEXT: tbnz w8, #31, .LBB5_64 +; NO_SVE-NEXT: .LBB5_42: // %else92 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB5_43: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB5_22 +; NO_SVE-NEXT: .LBB5_44: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB5_23 +; NO_SVE-NEXT: .LBB5_45: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB5_24 +; NO_SVE-NEXT: .LBB5_46: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB5_25 +; NO_SVE-NEXT: .LBB5_47: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB5_26 +; NO_SVE-NEXT: .LBB5_48: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #16, .LBB5_27 +; NO_SVE-NEXT: .LBB5_49: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #17, .LBB5_28 +; NO_SVE-NEXT: .LBB5_50: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #18, .LBB5_29 +; NO_SVE-NEXT: .LBB5_51: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB5_30 +; NO_SVE-NEXT: .LBB5_52: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB5_31 +; NO_SVE-NEXT: .LBB5_53: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB5_32 +; NO_SVE-NEXT: .LBB5_54: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB5_33 +; NO_SVE-NEXT: .LBB5_55: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB5_34 +; NO_SVE-NEXT: .LBB5_56: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB5_35 +; NO_SVE-NEXT: .LBB5_57: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB5_36 +; NO_SVE-NEXT: .LBB5_58: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB5_37 +; NO_SVE-NEXT: .LBB5_59: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB5_38 +; NO_SVE-NEXT: .LBB5_60: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB5_39 +; NO_SVE-NEXT: .LBB5_61: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB5_40 +; NO_SVE-NEXT: .LBB5_62: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #116 +; 
NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB5_41 +; NO_SVE-NEXT: .LBB5_63: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB5_42 +; NO_SVE-NEXT: .LBB5_64: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_load_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -142,6 +836,671 @@ } define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: ldp q4, q5, [x1, #192] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v4.4s +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v5.4s +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: ldp q6, q7, [x1, #224] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w9, v2.b[1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: fcmeq v6.4s, v0.4s, v6.4s +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: fcmeq v7.4s, v1.4s, v7.4s +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: ldp q16, q17, [x0, #128] +; NO_SVE-NEXT: umov w14, v2.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w9, w14, #0x1 +; NO_SVE-NEXT: ldp q21, q22, [x1, #128] +; NO_SVE-NEXT: xtn v18.8b, v3.8h +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w11, v18.b[0] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: umov w15, v18.b[6] +; NO_SVE-NEXT: fcmeq v21.4s, v16.4s, v21.4s +; NO_SVE-NEXT: bfi w10, w9, #5, #1 +; NO_SVE-NEXT: fcmeq v22.4s, v17.4s, v22.4s +; NO_SVE-NEXT: orr w9, w10, w12, lsl #6 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w11, v18.b[1] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w12, v18.b[2] +; NO_SVE-NEXT: ldp q23, q24, [x0, #160] +; NO_SVE-NEXT: uzp1 v25.8h, v21.8h, v22.8h +; NO_SVE-NEXT: orr w9, w9, w13, lsl #7 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w11, v18.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #9 +; NO_SVE-NEXT: xtn v25.8b, v25.8h +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: ldp q28, q29, [x1, #160] +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w11, v18.b[4] +; NO_SVE-NEXT: umov w12, v25.b[1] +; NO_SVE-NEXT: umov w13, v25.b[0] +; NO_SVE-NEXT: umov w14, v25.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #11 +; NO_SVE-NEXT: fcmeq v23.4s, v23.4s, v28.4s +; NO_SVE-NEXT: umov w10, v18.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w16, v25.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #12 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v25.b[3] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v25.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: fcmeq v24.4s, v24.4s, v29.4s +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: ldp 
q19, q20, [x0, #64] +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v25.b[5] +; NO_SVE-NEXT: uzp1 v23.8h, v23.8h, v24.8h +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w15, v18.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: ldp q26, q27, [x1, #64] +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: xtn v23.8b, v23.8h +; NO_SVE-NEXT: umov w14, v25.b[7] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #15 +; NO_SVE-NEXT: umov w15, v23.b[6] +; NO_SVE-NEXT: fcmeq v19.4s, v19.4s, v26.4s +; NO_SVE-NEXT: bfi w11, w12, #5, #1 +; NO_SVE-NEXT: umov w12, v23.b[0] +; NO_SVE-NEXT: orr w10, w11, w13, lsl #6 +; NO_SVE-NEXT: fcmeq v20.4s, v20.4s, v27.4s +; NO_SVE-NEXT: umov w13, v23.b[1] +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v23.b[2] +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #7 +; NO_SVE-NEXT: uzp1 v19.8h, v19.8h, v20.8h +; NO_SVE-NEXT: umov w11, v23.b[3] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v23.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #9 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v18.8b, v19.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: ldp q21, q22, [x1, #96] +; NO_SVE-NEXT: umov w12, v18.b[1] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v23.b[5] +; NO_SVE-NEXT: umov w16, v18.b[0] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v18.b[2] +; NO_SVE-NEXT: umov w17, v18.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: fcmeq v7.4s, v7.4s, v22.4s +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, v21.4s +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #1, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v18.b[3] +; NO_SVE-NEXT: umov w16, v18.b[4] +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: uzp1 v6.8h, v6.8h, v7.8h +; NO_SVE-NEXT: bfi w13, w12, #2, #1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #13 +; NO_SVE-NEXT: umov w11, v18.b[7] +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v18.b[6] +; NO_SVE-NEXT: xtn v6.8b, v6.8h +; NO_SVE-NEXT: orr w10, w10, w15, lsl #14 +; NO_SVE-NEXT: ldp q16, q17, [x1] +; NO_SVE-NEXT: bfi w13, w12, #3, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w13, w14, #4, #1 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: umov w14, v6.b[0] +; NO_SVE-NEXT: bfi w13, w16, #5, #1 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v16.4s +; NO_SVE-NEXT: orr w12, w13, w12, lsl #6 +; NO_SVE-NEXT: umov w13, v6.b[1] +; NO_SVE-NEXT: orr w11, w12, w11, lsl #7 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, v17.4s +; NO_SVE-NEXT: umov w16, v6.b[3] +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v6.b[2] +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w12, v6.b[4] +; NO_SVE-NEXT: umov w15, v23.b[7] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v6.b[5] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: ldp q2, q3, [x1, #32] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: orr w11, w11, 
w13, lsl #10 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: umov w17, v4.b[1] +; NO_SVE-NEXT: orr w11, w11, w16, lsl #11 +; NO_SVE-NEXT: umov w15, v4.b[2] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[0] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v3.4s +; NO_SVE-NEXT: umov w16, v4.b[3] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v2.4s +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w17, v4.b[4] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w18, v4.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w14, w13, #1, #1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: bfi w14, w15, #2, #1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: umov w17, v4.b[6] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: bfi w14, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v4.b[7] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w14, w15, #4, #1 +; NO_SVE-NEXT: umov w15, v6.b[6] +; NO_SVE-NEXT: bfi w14, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[0] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[1] +; NO_SVE-NEXT: orr w12, w14, w16, lsl #6 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #7 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[3] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #8 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #14 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[6] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v6.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w9, #0, .LBB6_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w9, #1, .LBB6_11 +; NO_SVE-NEXT: .LBB6_2: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB6_12 +; NO_SVE-NEXT: .LBB6_3: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB6_13 +; NO_SVE-NEXT: .LBB6_4: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB6_14 +; NO_SVE-NEXT: .LBB6_5: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB6_15 +; NO_SVE-NEXT: .LBB6_6: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB6_16 +; NO_SVE-NEXT: .LBB6_7: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB6_17 +; NO_SVE-NEXT: .LBB6_8: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB6_18 +; NO_SVE-NEXT: .LBB6_9: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: mov v24.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v24.s }[0], [x10] +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v16.16b, v2.16b +; NO_SVE-NEXT: mov v17.16b, v2.16b 
+; NO_SVE-NEXT: mov v18.16b, v2.16b +; NO_SVE-NEXT: mov v19.16b, v2.16b +; NO_SVE-NEXT: mov v20.16b, v2.16b +; NO_SVE-NEXT: mov v21.16b, v2.16b +; NO_SVE-NEXT: mov v22.16b, v2.16b +; NO_SVE-NEXT: mov v23.16b, v2.16b +; NO_SVE-NEXT: mov v2.16b, v24.16b +; NO_SVE-NEXT: tbnz w9, #9, .LBB6_19 +; NO_SVE-NEXT: b .LBB6_20 +; NO_SVE-NEXT: .LBB6_10: // %cond.load +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w9, #1, .LBB6_2 +; NO_SVE-NEXT: .LBB6_11: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #2, .LBB6_3 +; NO_SVE-NEXT: .LBB6_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB6_4 +; NO_SVE-NEXT: .LBB6_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB6_5 +; NO_SVE-NEXT: .LBB6_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB6_6 +; NO_SVE-NEXT: .LBB6_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB6_7 +; NO_SVE-NEXT: .LBB6_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB6_8 +; NO_SVE-NEXT: .LBB6_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB6_9 +; NO_SVE-NEXT: .LBB6_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v16.16b, v2.16b +; NO_SVE-NEXT: mov v17.16b, v2.16b +; NO_SVE-NEXT: mov v18.16b, v2.16b +; NO_SVE-NEXT: mov v19.16b, v2.16b +; NO_SVE-NEXT: mov v20.16b, v2.16b +; NO_SVE-NEXT: mov v21.16b, v2.16b +; NO_SVE-NEXT: mov v22.16b, v2.16b +; NO_SVE-NEXT: mov v23.16b, v2.16b +; NO_SVE-NEXT: tbz w9, #9, .LBB6_20 +; NO_SVE-NEXT: .LBB6_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x10] +; NO_SVE-NEXT: .LBB6_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB6_76 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB6_77 +; NO_SVE-NEXT: .LBB6_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB6_78 +; NO_SVE-NEXT: .LBB6_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB6_79 +; NO_SVE-NEXT: .LBB6_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB6_80 +; NO_SVE-NEXT: .LBB6_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB6_81 +; NO_SVE-NEXT: .LBB6_26: // %else44 +; NO_SVE-NEXT: tbnz w9, #16, .LBB6_82 +; NO_SVE-NEXT: .LBB6_27: // %else47 +; NO_SVE-NEXT: tbnz w9, #17, .LBB6_83 +; NO_SVE-NEXT: .LBB6_28: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB6_84 +; NO_SVE-NEXT: .LBB6_29: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB6_85 +; NO_SVE-NEXT: .LBB6_30: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB6_86 +; NO_SVE-NEXT: .LBB6_31: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB6_87 +; NO_SVE-NEXT: .LBB6_32: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB6_88 +; NO_SVE-NEXT: .LBB6_33: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB6_89 +; NO_SVE-NEXT: .LBB6_34: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB6_90 +; NO_SVE-NEXT: .LBB6_35: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB6_91 +; NO_SVE-NEXT: .LBB6_36: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB6_92 +; NO_SVE-NEXT: .LBB6_37: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB6_93 +; NO_SVE-NEXT: 
.LBB6_38: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB6_94 +; NO_SVE-NEXT: .LBB6_39: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB6_95 +; NO_SVE-NEXT: .LBB6_40: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB6_96 +; NO_SVE-NEXT: .LBB6_41: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB6_97 +; NO_SVE-NEXT: .LBB6_42: // %else92 +; NO_SVE-NEXT: tbnz x9, #32, .LBB6_98 +; NO_SVE-NEXT: .LBB6_43: // %else95 +; NO_SVE-NEXT: tbnz x9, #33, .LBB6_99 +; NO_SVE-NEXT: .LBB6_44: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB6_100 +; NO_SVE-NEXT: .LBB6_45: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB6_101 +; NO_SVE-NEXT: .LBB6_46: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB6_102 +; NO_SVE-NEXT: .LBB6_47: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB6_103 +; NO_SVE-NEXT: .LBB6_48: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB6_104 +; NO_SVE-NEXT: .LBB6_49: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB6_105 +; NO_SVE-NEXT: .LBB6_50: // %else116 +; NO_SVE-NEXT: tbnz x9, #40, .LBB6_106 +; NO_SVE-NEXT: .LBB6_51: // %else119 +; NO_SVE-NEXT: tbnz x9, #41, .LBB6_107 +; NO_SVE-NEXT: .LBB6_52: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB6_108 +; NO_SVE-NEXT: .LBB6_53: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB6_109 +; NO_SVE-NEXT: .LBB6_54: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB6_110 +; NO_SVE-NEXT: .LBB6_55: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB6_111 +; NO_SVE-NEXT: .LBB6_56: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB6_112 +; NO_SVE-NEXT: .LBB6_57: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB6_113 +; NO_SVE-NEXT: .LBB6_58: // %else140 +; NO_SVE-NEXT: tbnz x9, #48, .LBB6_114 +; NO_SVE-NEXT: .LBB6_59: // %else143 +; NO_SVE-NEXT: tbnz x9, #49, .LBB6_115 +; NO_SVE-NEXT: .LBB6_60: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB6_116 +; NO_SVE-NEXT: .LBB6_61: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB6_117 +; NO_SVE-NEXT: .LBB6_62: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB6_118 +; NO_SVE-NEXT: .LBB6_63: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB6_119 +; NO_SVE-NEXT: .LBB6_64: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB6_120 +; NO_SVE-NEXT: .LBB6_65: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB6_121 +; NO_SVE-NEXT: .LBB6_66: // %else164 +; NO_SVE-NEXT: tbnz x9, #56, .LBB6_122 +; NO_SVE-NEXT: .LBB6_67: // %else167 +; NO_SVE-NEXT: tbnz x9, #57, .LBB6_123 +; NO_SVE-NEXT: .LBB6_68: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB6_124 +; NO_SVE-NEXT: .LBB6_69: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB6_125 +; NO_SVE-NEXT: .LBB6_70: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB6_126 +; NO_SVE-NEXT: .LBB6_71: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB6_127 +; NO_SVE-NEXT: .LBB6_72: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB6_128 +; NO_SVE-NEXT: .LBB6_73: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB6_75 +; NO_SVE-NEXT: .LBB6_74: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #252 +; NO_SVE-NEXT: ld1 { v23.s }[3], [x9] +; NO_SVE-NEXT: .LBB6_75: // %else188 +; NO_SVE-NEXT: stp q0, q1, [x8] +; NO_SVE-NEXT: stp q2, q3, [x8, #32] +; NO_SVE-NEXT: stp q4, q5, [x8, #64] +; NO_SVE-NEXT: stp q6, q7, [x8, #96] +; NO_SVE-NEXT: stp q16, q17, [x8, #128] +; NO_SVE-NEXT: stp q18, q19, [x8, #160] +; NO_SVE-NEXT: stp q20, q21, [x8, #192] +; NO_SVE-NEXT: stp q22, q23, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB6_76: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB6_22 +; NO_SVE-NEXT: .LBB6_77: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], 
[x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB6_23 +; NO_SVE-NEXT: .LBB6_78: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB6_24 +; NO_SVE-NEXT: .LBB6_79: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB6_25 +; NO_SVE-NEXT: .LBB6_80: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB6_26 +; NO_SVE-NEXT: .LBB6_81: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #16, .LBB6_27 +; NO_SVE-NEXT: .LBB6_82: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #17, .LBB6_28 +; NO_SVE-NEXT: .LBB6_83: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #18, .LBB6_29 +; NO_SVE-NEXT: .LBB6_84: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB6_30 +; NO_SVE-NEXT: .LBB6_85: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB6_31 +; NO_SVE-NEXT: .LBB6_86: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB6_32 +; NO_SVE-NEXT: .LBB6_87: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB6_33 +; NO_SVE-NEXT: .LBB6_88: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB6_34 +; NO_SVE-NEXT: .LBB6_89: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB6_35 +; NO_SVE-NEXT: .LBB6_90: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB6_36 +; NO_SVE-NEXT: .LBB6_91: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB6_37 +; NO_SVE-NEXT: .LBB6_92: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB6_38 +; NO_SVE-NEXT: .LBB6_93: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB6_39 +; NO_SVE-NEXT: .LBB6_94: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB6_40 +; NO_SVE-NEXT: .LBB6_95: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB6_41 +; NO_SVE-NEXT: .LBB6_96: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB6_42 +; NO_SVE-NEXT: .LBB6_97: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #32, .LBB6_43 +; NO_SVE-NEXT: .LBB6_98: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #128 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #33, .LBB6_44 +; NO_SVE-NEXT: .LBB6_99: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #132 +; NO_SVE-NEXT: ld1 { v16.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #34, .LBB6_45 +; NO_SVE-NEXT: .LBB6_100: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #136 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB6_46 
+; NO_SVE-NEXT: .LBB6_101: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #140 +; NO_SVE-NEXT: ld1 { v16.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB6_47 +; NO_SVE-NEXT: .LBB6_102: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #144 +; NO_SVE-NEXT: ld1 { v17.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB6_48 +; NO_SVE-NEXT: .LBB6_103: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #148 +; NO_SVE-NEXT: ld1 { v17.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB6_49 +; NO_SVE-NEXT: .LBB6_104: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #152 +; NO_SVE-NEXT: ld1 { v17.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB6_50 +; NO_SVE-NEXT: .LBB6_105: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #156 +; NO_SVE-NEXT: ld1 { v17.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB6_51 +; NO_SVE-NEXT: .LBB6_106: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #160 +; NO_SVE-NEXT: ld1 { v18.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB6_52 +; NO_SVE-NEXT: .LBB6_107: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #164 +; NO_SVE-NEXT: ld1 { v18.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB6_53 +; NO_SVE-NEXT: .LBB6_108: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #168 +; NO_SVE-NEXT: ld1 { v18.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB6_54 +; NO_SVE-NEXT: .LBB6_109: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #172 +; NO_SVE-NEXT: ld1 { v18.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB6_55 +; NO_SVE-NEXT: .LBB6_110: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #176 +; NO_SVE-NEXT: ld1 { v19.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB6_56 +; NO_SVE-NEXT: .LBB6_111: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #180 +; NO_SVE-NEXT: ld1 { v19.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB6_57 +; NO_SVE-NEXT: .LBB6_112: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #184 +; NO_SVE-NEXT: ld1 { v19.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB6_58 +; NO_SVE-NEXT: .LBB6_113: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #188 +; NO_SVE-NEXT: ld1 { v19.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #48, .LBB6_59 +; NO_SVE-NEXT: .LBB6_114: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #192 +; NO_SVE-NEXT: ld1 { v20.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #49, .LBB6_60 +; NO_SVE-NEXT: .LBB6_115: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #196 +; NO_SVE-NEXT: ld1 { v20.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #50, .LBB6_61 +; NO_SVE-NEXT: .LBB6_116: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #200 +; NO_SVE-NEXT: ld1 { v20.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB6_62 +; NO_SVE-NEXT: .LBB6_117: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #204 +; NO_SVE-NEXT: ld1 { v20.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB6_63 +; NO_SVE-NEXT: .LBB6_118: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #208 +; NO_SVE-NEXT: ld1 { v21.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB6_64 +; NO_SVE-NEXT: .LBB6_119: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #212 +; NO_SVE-NEXT: ld1 { v21.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB6_65 +; NO_SVE-NEXT: .LBB6_120: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #216 +; NO_SVE-NEXT: ld1 { v21.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB6_66 +; NO_SVE-NEXT: .LBB6_121: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #220 +; NO_SVE-NEXT: ld1 { v21.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB6_67 +; NO_SVE-NEXT: .LBB6_122: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #224 +; NO_SVE-NEXT: ld1 { v22.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB6_68 +; NO_SVE-NEXT: .LBB6_123: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #228 +; NO_SVE-NEXT: ld1 { v22.s }[1], [x10] +; 
NO_SVE-NEXT: tbz x9, #58, .LBB6_69 +; NO_SVE-NEXT: .LBB6_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #232 +; NO_SVE-NEXT: ld1 { v22.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB6_70 +; NO_SVE-NEXT: .LBB6_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #236 +; NO_SVE-NEXT: ld1 { v22.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB6_71 +; NO_SVE-NEXT: .LBB6_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #240 +; NO_SVE-NEXT: ld1 { v23.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB6_72 +; NO_SVE-NEXT: .LBB6_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #244 +; NO_SVE-NEXT: ld1 { v23.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB6_73 +; NO_SVE-NEXT: .LBB6_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #248 +; NO_SVE-NEXT: ld1 { v23.s }[2], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB6_74 +; NO_SVE-NEXT: b .LBB6_75 +; ; VBITS_GE_2048-LABEL: masked_load_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -159,6 +1518,596 @@ } define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x1, #32] +; NO_SVE-NEXT: cmeq v4.16b, v3.16b, v4.16b +; NO_SVE-NEXT: cmeq v3.16b, v2.16b, v5.16b +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w16, v4.b[8] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v4.b[9] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w8, v4.b[10] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w10, v4.b[11] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v4.b[12] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w12, v4.b[13] +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w13, v4.b[14] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #10 +; NO_SVE-NEXT: umov w9, v3.b[1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #11 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #12 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #13 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w12, w9, #1, #1 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w16, v3.b[14] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w12, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v3.b[7] +; NO_SVE-NEXT: bfi w12, w9, #3, #1 +; 
NO_SVE-NEXT: and w9, w15, #0x1 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: bfi w12, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[8] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[9] +; NO_SVE-NEXT: bfi w12, w9, #5, #1 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: orr w10, w12, w13, lsl #6 +; NO_SVE-NEXT: umov w12, v3.b[10] +; NO_SVE-NEXT: orr w9, w10, w9, lsl #7 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: ldp q2, q5, [x1] +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[11] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v3.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w14, lsl #15 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, v5.16b +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w13, v3.b[13] +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: umov w15, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, v2.16b +; NO_SVE-NEXT: umov w17, v1.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: bfi w14, w10, #1, #1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: bfi w14, w11, #2, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[9] +; NO_SVE-NEXT: bfi w14, w10, #3, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w14, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[8] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #14 +; NO_SVE-NEXT: umov w15, v0.b[4] +; NO_SVE-NEXT: bfi w14, w10, #5, #1 +; NO_SVE-NEXT: orr w10, w14, w13, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[10] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w12, lsl #7 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #9 +; NO_SVE-NEXT: umov w14, v3.b[15] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[12] +; NO_SVE-NEXT: umov w16, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #16, #16 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: bfi w11, w14, #2, #1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[14] +; NO_SVE-NEXT: bfi w11, w14, #5, #1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[8] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: and w12, w12, 
#0x1 +; NO_SVE-NEXT: umov w17, v0.b[9] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v0.b[10] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[11] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v0.b[12] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #14 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: orr w11, w11, w15, lsl #9 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[14] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w15, lsl #11 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #15 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #13 +; NO_SVE-NEXT: orr w8, w11, w12, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w15, lsl #15 +; NO_SVE-NEXT: bfi w8, w10, #16, #16 +; NO_SVE-NEXT: bfi x8, x9, #32, #32 +; NO_SVE-NEXT: tbz w8, #0, .LBB7_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB7_3 +; NO_SVE-NEXT: b .LBB7_4 +; NO_SVE-NEXT: .LBB7_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB7_4 +; NO_SVE-NEXT: .LBB7_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB7_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB7_21 +; NO_SVE-NEXT: .LBB7_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB7_22 +; NO_SVE-NEXT: .LBB7_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB7_23 +; NO_SVE-NEXT: .LBB7_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB7_24 +; NO_SVE-NEXT: .LBB7_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB7_25 +; NO_SVE-NEXT: .LBB7_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB7_26 +; NO_SVE-NEXT: .LBB7_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB7_27 +; NO_SVE-NEXT: .LBB7_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB7_28 +; NO_SVE-NEXT: .LBB7_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB7_29 +; NO_SVE-NEXT: .LBB7_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB7_30 +; NO_SVE-NEXT: .LBB7_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB7_31 +; NO_SVE-NEXT: .LBB7_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB7_32 +; NO_SVE-NEXT: .LBB7_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB7_33 +; NO_SVE-NEXT: .LBB7_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB7_34 +; NO_SVE-NEXT: .LBB7_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB7_35 +; NO_SVE-NEXT: b .LBB7_36 +; NO_SVE-NEXT: .LBB7_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB7_6 +; NO_SVE-NEXT: .LBB7_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB7_7 +; NO_SVE-NEXT: .LBB7_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB7_8 +; NO_SVE-NEXT: .LBB7_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB7_9 +; NO_SVE-NEXT: .LBB7_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b 
}[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB7_10 +; NO_SVE-NEXT: .LBB7_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB7_11 +; NO_SVE-NEXT: .LBB7_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB7_12 +; NO_SVE-NEXT: .LBB7_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB7_13 +; NO_SVE-NEXT: .LBB7_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB7_14 +; NO_SVE-NEXT: .LBB7_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB7_15 +; NO_SVE-NEXT: .LBB7_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB7_16 +; NO_SVE-NEXT: .LBB7_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB7_17 +; NO_SVE-NEXT: .LBB7_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB7_18 +; NO_SVE-NEXT: .LBB7_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB7_19 +; NO_SVE-NEXT: .LBB7_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #17, .LBB7_36 +; NO_SVE-NEXT: .LBB7_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB7_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB7_53 +; NO_SVE-NEXT: .LBB7_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB7_54 +; NO_SVE-NEXT: .LBB7_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB7_55 +; NO_SVE-NEXT: .LBB7_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB7_56 +; NO_SVE-NEXT: .LBB7_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB7_57 +; NO_SVE-NEXT: .LBB7_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB7_58 +; NO_SVE-NEXT: .LBB7_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB7_59 +; NO_SVE-NEXT: .LBB7_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB7_60 +; NO_SVE-NEXT: .LBB7_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB7_61 +; NO_SVE-NEXT: .LBB7_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB7_62 +; NO_SVE-NEXT: .LBB7_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB7_63 +; NO_SVE-NEXT: .LBB7_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB7_64 +; NO_SVE-NEXT: .LBB7_49: // %else89 +; NO_SVE-NEXT: tbnz w8, #31, .LBB7_65 +; NO_SVE-NEXT: .LBB7_50: // %else92 +; NO_SVE-NEXT: tbz x8, #32, .LBB7_66 +; NO_SVE-NEXT: .LBB7_51: // %cond.load94 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz x8, #33, .LBB7_67 +; NO_SVE-NEXT: b .LBB7_68 +; NO_SVE-NEXT: .LBB7_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB7_38 +; NO_SVE-NEXT: .LBB7_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB7_39 +; NO_SVE-NEXT: .LBB7_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB7_40 +; NO_SVE-NEXT: .LBB7_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB7_41 +; NO_SVE-NEXT: 
.LBB7_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB7_42 +; NO_SVE-NEXT: .LBB7_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB7_43 +; NO_SVE-NEXT: .LBB7_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB7_44 +; NO_SVE-NEXT: .LBB7_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB7_45 +; NO_SVE-NEXT: .LBB7_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB7_46 +; NO_SVE-NEXT: .LBB7_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB7_47 +; NO_SVE-NEXT: .LBB7_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB7_48 +; NO_SVE-NEXT: .LBB7_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB7_49 +; NO_SVE-NEXT: .LBB7_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB7_50 +; NO_SVE-NEXT: .LBB7_65: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: tbnz x8, #32, .LBB7_51 +; NO_SVE-NEXT: .LBB7_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x8, #33, .LBB7_68 +; NO_SVE-NEXT: .LBB7_67: // %cond.load97 +; NO_SVE-NEXT: add x9, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_68: // %else98 +; NO_SVE-NEXT: tbnz x8, #34, .LBB7_84 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x8, #35, .LBB7_85 +; NO_SVE-NEXT: .LBB7_70: // %else104 +; NO_SVE-NEXT: tbnz x8, #36, .LBB7_86 +; NO_SVE-NEXT: .LBB7_71: // %else107 +; NO_SVE-NEXT: tbnz x8, #37, .LBB7_87 +; NO_SVE-NEXT: .LBB7_72: // %else110 +; NO_SVE-NEXT: tbnz x8, #38, .LBB7_88 +; NO_SVE-NEXT: .LBB7_73: // %else113 +; NO_SVE-NEXT: tbnz x8, #39, .LBB7_89 +; NO_SVE-NEXT: .LBB7_74: // %else116 +; NO_SVE-NEXT: tbnz x8, #40, .LBB7_90 +; NO_SVE-NEXT: .LBB7_75: // %else119 +; NO_SVE-NEXT: tbnz x8, #41, .LBB7_91 +; NO_SVE-NEXT: .LBB7_76: // %else122 +; NO_SVE-NEXT: tbnz x8, #42, .LBB7_92 +; NO_SVE-NEXT: .LBB7_77: // %else125 +; NO_SVE-NEXT: tbnz x8, #43, .LBB7_93 +; NO_SVE-NEXT: .LBB7_78: // %else128 +; NO_SVE-NEXT: tbnz x8, #44, .LBB7_94 +; NO_SVE-NEXT: .LBB7_79: // %else131 +; NO_SVE-NEXT: tbnz x8, #45, .LBB7_95 +; NO_SVE-NEXT: .LBB7_80: // %else134 +; NO_SVE-NEXT: tbnz x8, #46, .LBB7_96 +; NO_SVE-NEXT: .LBB7_81: // %else137 +; NO_SVE-NEXT: tbnz x8, #47, .LBB7_97 +; NO_SVE-NEXT: .LBB7_82: // %else140 +; NO_SVE-NEXT: tbz x8, #48, .LBB7_98 +; NO_SVE-NEXT: .LBB7_83: // %cond.load142 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x9] +; NO_SVE-NEXT: tbnz x8, #49, .LBB7_99 +; NO_SVE-NEXT: b .LBB7_100 +; NO_SVE-NEXT: .LBB7_84: // %cond.load100 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz x8, #35, .LBB7_70 +; NO_SVE-NEXT: .LBB7_85: // %cond.load103 +; NO_SVE-NEXT: add x9, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz x8, #36, .LBB7_71 +; NO_SVE-NEXT: .LBB7_86: // %cond.load106 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz x8, #37, .LBB7_72 +; NO_SVE-NEXT: .LBB7_87: // %cond.load109 +; NO_SVE-NEXT: add 
x9, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz x8, #38, .LBB7_73 +; NO_SVE-NEXT: .LBB7_88: // %cond.load112 +; NO_SVE-NEXT: add x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz x8, #39, .LBB7_74 +; NO_SVE-NEXT: .LBB7_89: // %cond.load115 +; NO_SVE-NEXT: add x9, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz x8, #40, .LBB7_75 +; NO_SVE-NEXT: .LBB7_90: // %cond.load118 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz x8, #41, .LBB7_76 +; NO_SVE-NEXT: .LBB7_91: // %cond.load121 +; NO_SVE-NEXT: add x9, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz x8, #42, .LBB7_77 +; NO_SVE-NEXT: .LBB7_92: // %cond.load124 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz x8, #43, .LBB7_78 +; NO_SVE-NEXT: .LBB7_93: // %cond.load127 +; NO_SVE-NEXT: add x9, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz x8, #44, .LBB7_79 +; NO_SVE-NEXT: .LBB7_94: // %cond.load130 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz x8, #45, .LBB7_80 +; NO_SVE-NEXT: .LBB7_95: // %cond.load133 +; NO_SVE-NEXT: add x9, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz x8, #46, .LBB7_81 +; NO_SVE-NEXT: .LBB7_96: // %cond.load136 +; NO_SVE-NEXT: add x9, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbz x8, #47, .LBB7_82 +; NO_SVE-NEXT: .LBB7_97: // %cond.load139 +; NO_SVE-NEXT: add x9, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x9] +; NO_SVE-NEXT: tbnz x8, #48, .LBB7_83 +; NO_SVE-NEXT: .LBB7_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz x8, #49, .LBB7_100 +; NO_SVE-NEXT: .LBB7_99: // %cond.load145 +; NO_SVE-NEXT: add x9, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_100: // %else146 +; NO_SVE-NEXT: tbnz x8, #50, .LBB7_115 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x8, #51, .LBB7_116 +; NO_SVE-NEXT: .LBB7_102: // %else152 +; NO_SVE-NEXT: tbnz x8, #52, .LBB7_117 +; NO_SVE-NEXT: .LBB7_103: // %else155 +; NO_SVE-NEXT: tbnz x8, #53, .LBB7_118 +; NO_SVE-NEXT: .LBB7_104: // %else158 +; NO_SVE-NEXT: tbnz x8, #54, .LBB7_119 +; NO_SVE-NEXT: .LBB7_105: // %else161 +; NO_SVE-NEXT: tbnz x8, #55, .LBB7_120 +; NO_SVE-NEXT: .LBB7_106: // %else164 +; NO_SVE-NEXT: tbnz x8, #56, .LBB7_121 +; NO_SVE-NEXT: .LBB7_107: // %else167 +; NO_SVE-NEXT: tbnz x8, #57, .LBB7_122 +; NO_SVE-NEXT: .LBB7_108: // %else170 +; NO_SVE-NEXT: tbnz x8, #58, .LBB7_123 +; NO_SVE-NEXT: .LBB7_109: // %else173 +; NO_SVE-NEXT: tbnz x8, #59, .LBB7_124 +; NO_SVE-NEXT: .LBB7_110: // %else176 +; NO_SVE-NEXT: tbnz x8, #60, .LBB7_125 +; NO_SVE-NEXT: .LBB7_111: // %else179 +; NO_SVE-NEXT: tbnz x8, #61, .LBB7_126 +; NO_SVE-NEXT: .LBB7_112: // %else182 +; NO_SVE-NEXT: tbnz x8, #62, .LBB7_127 +; NO_SVE-NEXT: .LBB7_113: // %else185 +; NO_SVE-NEXT: tbnz x8, #63, .LBB7_128 +; NO_SVE-NEXT: .LBB7_114: // %else188 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB7_115: // %cond.load148 +; NO_SVE-NEXT: add x9, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x9] +; NO_SVE-NEXT: tbz x8, #51, .LBB7_102 +; NO_SVE-NEXT: .LBB7_116: // %cond.load151 +; NO_SVE-NEXT: add x9, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x9] +; NO_SVE-NEXT: tbz x8, #52, .LBB7_103 +; NO_SVE-NEXT: .LBB7_117: // %cond.load154 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x9] +; NO_SVE-NEXT: tbz x8, #53, .LBB7_104 +; NO_SVE-NEXT: .LBB7_118: // %cond.load157 +; NO_SVE-NEXT: add x9, x0, #53 
+; NO_SVE-NEXT: ld1 { v3.b }[5], [x9] +; NO_SVE-NEXT: tbz x8, #54, .LBB7_105 +; NO_SVE-NEXT: .LBB7_119: // %cond.load160 +; NO_SVE-NEXT: add x9, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x9] +; NO_SVE-NEXT: tbz x8, #55, .LBB7_106 +; NO_SVE-NEXT: .LBB7_120: // %cond.load163 +; NO_SVE-NEXT: add x9, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x9] +; NO_SVE-NEXT: tbz x8, #56, .LBB7_107 +; NO_SVE-NEXT: .LBB7_121: // %cond.load166 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x9] +; NO_SVE-NEXT: tbz x8, #57, .LBB7_108 +; NO_SVE-NEXT: .LBB7_122: // %cond.load169 +; NO_SVE-NEXT: add x9, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x9] +; NO_SVE-NEXT: tbz x8, #58, .LBB7_109 +; NO_SVE-NEXT: .LBB7_123: // %cond.load172 +; NO_SVE-NEXT: add x9, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x9] +; NO_SVE-NEXT: tbz x8, #59, .LBB7_110 +; NO_SVE-NEXT: .LBB7_124: // %cond.load175 +; NO_SVE-NEXT: add x9, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x9] +; NO_SVE-NEXT: tbz x8, #60, .LBB7_111 +; NO_SVE-NEXT: .LBB7_125: // %cond.load178 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x9] +; NO_SVE-NEXT: tbz x8, #61, .LBB7_112 +; NO_SVE-NEXT: .LBB7_126: // %cond.load181 +; NO_SVE-NEXT: add x9, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x9] +; NO_SVE-NEXT: tbz x8, #62, .LBB7_113 +; NO_SVE-NEXT: .LBB7_127: // %cond.load184 +; NO_SVE-NEXT: add x9, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x9] +; NO_SVE-NEXT: tbz x8, #63, .LBB7_114 +; NO_SVE-NEXT: .LBB7_128: // %cond.load187 +; NO_SVE-NEXT: add x8, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 @@ -176,6 +2125,314 @@ } define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: ldp q1, q3, [x1, #32] +; NO_SVE-NEXT: cmeq v1.8h, v0.8h, v1.8h +; NO_SVE-NEXT: xtn v5.8b, v1.8h +; NO_SVE-NEXT: cmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: umov w8, v5.b[1] +; NO_SVE-NEXT: umov w9, v5.b[2] +; NO_SVE-NEXT: umov w10, v5.b[0] +; NO_SVE-NEXT: umov w11, v5.b[3] +; NO_SVE-NEXT: umov w12, v5.b[4] +; NO_SVE-NEXT: umov w13, v5.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[6] +; NO_SVE-NEXT: ldp q4, q0, [x0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v5.b[7] +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: bfi w10, w11, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w12, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: cmeq v3.8h, v4.8h, v3.8h +; NO_SVE-NEXT: bfi w10, w13, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #6 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #7 +; NO_SVE-NEXT: umov w13, v3.b[1] +; NO_SVE-NEXT: umov w14, v3.b[2] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #8 +; NO_SVE-NEXT: and w11, w11, 
#0x1 +; NO_SVE-NEXT: orr w8, w10, w8, lsl #9 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #10 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[3] +; NO_SVE-NEXT: umov w15, v3.b[4] +; NO_SVE-NEXT: umov w16, v3.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[6] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #2, #1 +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #6 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB8_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB8_3 +; NO_SVE-NEXT: b .LBB8_4 +; NO_SVE-NEXT: .LBB8_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB8_4 +; NO_SVE-NEXT: .LBB8_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB8_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB8_13 +; NO_SVE-NEXT: .LBB8_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB8_14 +; NO_SVE-NEXT: .LBB8_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB8_15 +; NO_SVE-NEXT: .LBB8_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB8_16 +; NO_SVE-NEXT: .LBB8_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB8_17 +; NO_SVE-NEXT: .LBB8_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB8_18 +; NO_SVE-NEXT: .LBB8_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB8_19 +; NO_SVE-NEXT: b .LBB8_20 +; NO_SVE-NEXT: .LBB8_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB8_6 +; NO_SVE-NEXT: .LBB8_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB8_7 
+; NO_SVE-NEXT: .LBB8_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB8_8 +; NO_SVE-NEXT: .LBB8_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB8_9 +; NO_SVE-NEXT: .LBB8_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB8_10 +; NO_SVE-NEXT: .LBB8_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB8_11 +; NO_SVE-NEXT: .LBB8_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB8_20 +; NO_SVE-NEXT: .LBB8_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB8_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB8_29 +; NO_SVE-NEXT: .LBB8_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB8_30 +; NO_SVE-NEXT: .LBB8_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB8_31 +; NO_SVE-NEXT: .LBB8_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB8_32 +; NO_SVE-NEXT: .LBB8_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB8_33 +; NO_SVE-NEXT: .LBB8_26: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB8_34 +; NO_SVE-NEXT: .LBB8_27: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB8_35 +; NO_SVE-NEXT: b .LBB8_36 +; NO_SVE-NEXT: .LBB8_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB8_22 +; NO_SVE-NEXT: .LBB8_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB8_23 +; NO_SVE-NEXT: .LBB8_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB8_24 +; NO_SVE-NEXT: .LBB8_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB8_25 +; NO_SVE-NEXT: .LBB8_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB8_26 +; NO_SVE-NEXT: .LBB8_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB8_27 +; NO_SVE-NEXT: .LBB8_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB8_36 +; NO_SVE-NEXT: .LBB8_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB8_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB8_45 +; NO_SVE-NEXT: .LBB8_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB8_46 +; NO_SVE-NEXT: .LBB8_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB8_47 +; NO_SVE-NEXT: .LBB8_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB8_48 +; NO_SVE-NEXT: .LBB8_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB8_49 +; NO_SVE-NEXT: .LBB8_42: // %else68 +; NO_SVE-NEXT: tbz w8, #24, .LBB8_50 +; NO_SVE-NEXT: .LBB8_43: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB8_51 +; NO_SVE-NEXT: b .LBB8_52 +; NO_SVE-NEXT: .LBB8_44: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB8_38 +; NO_SVE-NEXT: .LBB8_45: // %cond.load55 +; NO_SVE-NEXT: add 
x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB8_39 +; NO_SVE-NEXT: .LBB8_46: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB8_40 +; NO_SVE-NEXT: .LBB8_47: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB8_41 +; NO_SVE-NEXT: .LBB8_48: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB8_42 +; NO_SVE-NEXT: .LBB8_49: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #24, .LBB8_43 +; NO_SVE-NEXT: .LBB8_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #25, .LBB8_52 +; NO_SVE-NEXT: .LBB8_51: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_52: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB8_59 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB8_60 +; NO_SVE-NEXT: .LBB8_54: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB8_61 +; NO_SVE-NEXT: .LBB8_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB8_62 +; NO_SVE-NEXT: .LBB8_56: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB8_63 +; NO_SVE-NEXT: .LBB8_57: // %else89 +; NO_SVE-NEXT: tbnz w8, #31, .LBB8_64 +; NO_SVE-NEXT: .LBB8_58: // %else92 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB8_59: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB8_54 +; NO_SVE-NEXT: .LBB8_60: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB8_55 +; NO_SVE-NEXT: .LBB8_61: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB8_56 +; NO_SVE-NEXT: .LBB8_62: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB8_57 +; NO_SVE-NEXT: .LBB8_63: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB8_58 +; NO_SVE-NEXT: .LBB8_64: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -193,6 +2450,172 @@ } define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, v3.4s +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, v2.4s +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q4, q1, [x1, #32] +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v2.4s, v1.4s +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: cmeq v2.4s, v3.4s, v4.4s +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w12, 
w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB9_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB9_3 +; NO_SVE-NEXT: b .LBB9_4 +; NO_SVE-NEXT: .LBB9_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB9_4 +; NO_SVE-NEXT: .LBB9_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB9_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB9_9 +; NO_SVE-NEXT: .LBB9_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB9_10 +; NO_SVE-NEXT: .LBB9_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB9_11 +; NO_SVE-NEXT: b .LBB9_12 +; NO_SVE-NEXT: .LBB9_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB9_6 +; NO_SVE-NEXT: .LBB9_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB9_7 +; NO_SVE-NEXT: .LBB9_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB9_12 +; NO_SVE-NEXT: .LBB9_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB9_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB9_17 +; NO_SVE-NEXT: .LBB9_14: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB9_18 +; NO_SVE-NEXT: .LBB9_15: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB9_19 +; NO_SVE-NEXT: b .LBB9_20 +; NO_SVE-NEXT: .LBB9_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB9_14 +; NO_SVE-NEXT: .LBB9_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB9_15 +; NO_SVE-NEXT: .LBB9_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB9_20 +; NO_SVE-NEXT: .LBB9_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB9_24 +; NO_SVE-NEXT: 
// %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB9_25 +; NO_SVE-NEXT: .LBB9_22: // %else32 +; NO_SVE-NEXT: tbz w8, #12, .LBB9_26 +; NO_SVE-NEXT: .LBB9_23: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB9_27 +; NO_SVE-NEXT: b .LBB9_28 +; NO_SVE-NEXT: .LBB9_24: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB9_22 +; NO_SVE-NEXT: .LBB9_25: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #12, .LBB9_23 +; NO_SVE-NEXT: .LBB9_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB9_28 +; NO_SVE-NEXT: .LBB9_27: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_28: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB9_31 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB9_32 +; NO_SVE-NEXT: .LBB9_30: // %else44 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB9_31: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB9_30 +; NO_SVE-NEXT: .LBB9_32: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -210,6 +2633,99 @@ } define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x1, #32] +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, v4.2d +; NO_SVE-NEXT: ldp q6, q7, [x1] +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, v5.2d +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, v6.2d +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, v7.2d +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB10_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB10_3 +; NO_SVE-NEXT: b .LBB10_4 +; NO_SVE-NEXT: .LBB10_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB10_4 +; NO_SVE-NEXT: .LBB10_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB10_4: // %else2 +; NO_SVE-NEXT: tbz w8, #2, .LBB10_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; 
NO_SVE-NEXT: tbnz w8, #3, .LBB10_7 +; NO_SVE-NEXT: b .LBB10_8 +; NO_SVE-NEXT: .LBB10_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB10_8 +; NO_SVE-NEXT: .LBB10_7: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB10_8: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB10_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB10_11 +; NO_SVE-NEXT: b .LBB10_12 +; NO_SVE-NEXT: .LBB10_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB10_12 +; NO_SVE-NEXT: .LBB10_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB10_12: // %else14 +; NO_SVE-NEXT: tbz w8, #6, .LBB10_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB10_15 +; NO_SVE-NEXT: b .LBB10_16 +; NO_SVE-NEXT: .LBB10_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB10_16 +; NO_SVE-NEXT: .LBB10_15: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #56 +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: .LBB10_16: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -227,6 +2743,96 @@ } define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_passthru_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: ldp q2, q3, [x1, #32] +; NO_SVE-NEXT: cmeq v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: cmeq v7.2d, v7.2d, v3.2d +; NO_SVE-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, v0.2d +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, v1.2d +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB11_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB11_10 +; NO_SVE-NEXT: .LBB11_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB11_11 +; NO_SVE-NEXT: .LBB11_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB11_12 +; NO_SVE-NEXT: .LBB11_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB11_13 +; NO_SVE-NEXT: .LBB11_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB11_14 +; NO_SVE-NEXT: .LBB11_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB11_15 +; NO_SVE-NEXT: .LBB11_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB11_16 +; NO_SVE-NEXT: .LBB11_8: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; 
NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB11_9: // %cond.load +; NO_SVE-NEXT: ld1 { v0.d }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB11_2 +; NO_SVE-NEXT: .LBB11_10: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB11_3 +; NO_SVE-NEXT: .LBB11_11: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB11_4 +; NO_SVE-NEXT: .LBB11_12: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB11_5 +; NO_SVE-NEXT: .LBB11_13: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB11_6 +; NO_SVE-NEXT: .LBB11_14: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB11_7 +; NO_SVE-NEXT: .LBB11_15: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB11_8 +; NO_SVE-NEXT: .LBB11_16: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #56 +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_passthru_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -245,6 +2851,96 @@ } define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>* %bp) #0 { +; NO_SVE-LABEL: masked_load_passthru_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: ldp q2, q3, [x1, #32] +; NO_SVE-NEXT: fcmeq v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: fcmeq v7.2d, v7.2d, v3.2d +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, v0.2d +; NO_SVE-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, v1.2d +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB12_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB12_10 +; NO_SVE-NEXT: .LBB12_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB12_11 +; NO_SVE-NEXT: .LBB12_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB12_12 +; NO_SVE-NEXT: .LBB12_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB12_13 +; NO_SVE-NEXT: .LBB12_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB12_14 +; NO_SVE-NEXT: .LBB12_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB12_15 +; NO_SVE-NEXT: .LBB12_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB12_16 +; NO_SVE-NEXT: .LBB12_8: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB12_9: // %cond.load +; NO_SVE-NEXT: ld1 { v0.d }[0], 
[x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB12_2 +; NO_SVE-NEXT: .LBB12_10: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB12_3 +; NO_SVE-NEXT: .LBB12_11: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB12_4 +; NO_SVE-NEXT: .LBB12_12: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB12_5 +; NO_SVE-NEXT: .LBB12_13: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB12_6 +; NO_SVE-NEXT: .LBB12_14: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB12_7 +; NO_SVE-NEXT: .LBB12_15: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB12_8 +; NO_SVE-NEXT: .LBB12_16: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #56 +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_passthru_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -263,6 +2959,306 @@ } define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w17, v0.b[9] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: umov w8, v0.b[10] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[11] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v1.b[2] +; NO_SVE-NEXT: bfi w12, w10, #1, #1 +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #9 +; NO_SVE-NEXT: umov w15, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #10 +; NO_SVE-NEXT: umov w9, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w14, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w12, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: umov w11, v0.b[13] +; 
NO_SVE-NEXT: bfi w12, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v0.b[14] +; NO_SVE-NEXT: bfi w12, w9, #5, #1 +; NO_SVE-NEXT: and w9, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[8] +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w12, w9, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #13 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[11] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[12] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[13] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB13_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB13_3 +; NO_SVE-NEXT: b .LBB13_4 +; NO_SVE-NEXT: .LBB13_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB13_4 +; NO_SVE-NEXT: .LBB13_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB13_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB13_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB13_21 +; NO_SVE-NEXT: .LBB13_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB13_22 +; NO_SVE-NEXT: .LBB13_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB13_23 +; NO_SVE-NEXT: .LBB13_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB13_24 +; NO_SVE-NEXT: .LBB13_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB13_25 +; NO_SVE-NEXT: .LBB13_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB13_26 +; NO_SVE-NEXT: .LBB13_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB13_27 +; NO_SVE-NEXT: .LBB13_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB13_28 +; NO_SVE-NEXT: .LBB13_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB13_29 +; NO_SVE-NEXT: .LBB13_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB13_30 +; NO_SVE-NEXT: .LBB13_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB13_31 +; NO_SVE-NEXT: .LBB13_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB13_32 +; NO_SVE-NEXT: .LBB13_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB13_33 +; NO_SVE-NEXT: .LBB13_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB13_34 +; NO_SVE-NEXT: .LBB13_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB13_35 +; NO_SVE-NEXT: b .LBB13_36 +; NO_SVE-NEXT: .LBB13_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB13_6 +; NO_SVE-NEXT: .LBB13_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB13_7 +; NO_SVE-NEXT: .LBB13_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB13_8 +; NO_SVE-NEXT: 
.LBB13_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB13_9 +; NO_SVE-NEXT: .LBB13_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB13_10 +; NO_SVE-NEXT: .LBB13_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB13_11 +; NO_SVE-NEXT: .LBB13_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB13_12 +; NO_SVE-NEXT: .LBB13_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB13_13 +; NO_SVE-NEXT: .LBB13_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB13_14 +; NO_SVE-NEXT: .LBB13_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB13_15 +; NO_SVE-NEXT: .LBB13_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB13_16 +; NO_SVE-NEXT: .LBB13_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB13_17 +; NO_SVE-NEXT: .LBB13_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB13_18 +; NO_SVE-NEXT: .LBB13_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB13_19 +; NO_SVE-NEXT: .LBB13_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB13_36 +; NO_SVE-NEXT: .LBB13_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB13_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB13_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB13_53 +; NO_SVE-NEXT: .LBB13_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB13_54 +; NO_SVE-NEXT: .LBB13_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB13_55 +; NO_SVE-NEXT: .LBB13_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB13_56 +; NO_SVE-NEXT: .LBB13_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB13_57 +; NO_SVE-NEXT: .LBB13_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB13_58 +; NO_SVE-NEXT: .LBB13_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB13_59 +; NO_SVE-NEXT: .LBB13_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB13_60 +; NO_SVE-NEXT: .LBB13_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB13_61 +; NO_SVE-NEXT: .LBB13_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB13_62 +; NO_SVE-NEXT: .LBB13_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB13_63 +; NO_SVE-NEXT: .LBB13_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB13_64 +; NO_SVE-NEXT: .LBB13_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB13_51 +; NO_SVE-NEXT: .LBB13_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB13_51: // %else92 +; NO_SVE-NEXT: sshll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB13_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB13_38 +; NO_SVE-NEXT: .LBB13_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; 
NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB13_39 +; NO_SVE-NEXT: .LBB13_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB13_40 +; NO_SVE-NEXT: .LBB13_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB13_41 +; NO_SVE-NEXT: .LBB13_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB13_42 +; NO_SVE-NEXT: .LBB13_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB13_43 +; NO_SVE-NEXT: .LBB13_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB13_44 +; NO_SVE-NEXT: .LBB13_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB13_45 +; NO_SVE-NEXT: .LBB13_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB13_46 +; NO_SVE-NEXT: .LBB13_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB13_47 +; NO_SVE-NEXT: .LBB13_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB13_48 +; NO_SVE-NEXT: .LBB13_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB13_49 +; NO_SVE-NEXT: .LBB13_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB13_50 +; NO_SVE-NEXT: b .LBB13_51 +; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -279,6 +3275,164 @@ } define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[8] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[9] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[10] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[11] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[12] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[14] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; 
NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB14_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB14_19 +; NO_SVE-NEXT: .LBB14_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB14_20 +; NO_SVE-NEXT: .LBB14_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB14_21 +; NO_SVE-NEXT: .LBB14_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB14_22 +; NO_SVE-NEXT: .LBB14_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB14_23 +; NO_SVE-NEXT: .LBB14_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB14_24 +; NO_SVE-NEXT: .LBB14_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB14_25 +; NO_SVE-NEXT: .LBB14_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB14_26 +; NO_SVE-NEXT: .LBB14_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB14_27 +; NO_SVE-NEXT: .LBB14_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB14_28 +; NO_SVE-NEXT: .LBB14_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB14_29 +; NO_SVE-NEXT: .LBB14_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB14_30 +; NO_SVE-NEXT: .LBB14_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB14_31 +; NO_SVE-NEXT: .LBB14_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB14_32 +; NO_SVE-NEXT: .LBB14_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB14_17 +; NO_SVE-NEXT: .LBB14_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB14_17: // %else44 +; NO_SVE-NEXT: sshll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB14_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB14_2 +; NO_SVE-NEXT: .LBB14_19: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB14_3 +; NO_SVE-NEXT: .LBB14_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB14_4 +; NO_SVE-NEXT: .LBB14_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB14_5 +; NO_SVE-NEXT: .LBB14_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB14_6 +; NO_SVE-NEXT: .LBB14_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB14_7 +; NO_SVE-NEXT: .LBB14_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB14_8 +; NO_SVE-NEXT: .LBB14_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB14_9 +; NO_SVE-NEXT: .LBB14_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB14_10 +; NO_SVE-NEXT: .LBB14_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB14_11 +; NO_SVE-NEXT: .LBB14_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b 
}[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB14_12 +; NO_SVE-NEXT: .LBB14_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB14_13 +; NO_SVE-NEXT: .LBB14_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB14_14 +; NO_SVE-NEXT: .LBB14_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB14_15 +; NO_SVE-NEXT: .LBB14_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB14_16 +; NO_SVE-NEXT: b .LBB14_17 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -295,6 +3449,93 @@ } define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: cmeq v0.8b, v0.8b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB15_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB15_11 +; NO_SVE-NEXT: .LBB15_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB15_12 +; NO_SVE-NEXT: .LBB15_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB15_13 +; NO_SVE-NEXT: .LBB15_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB15_14 +; NO_SVE-NEXT: .LBB15_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB15_15 +; NO_SVE-NEXT: .LBB15_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB15_16 +; NO_SVE-NEXT: .LBB15_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB15_9 +; NO_SVE-NEXT: .LBB15_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB15_9: // %else20 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB15_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB15_2 +; NO_SVE-NEXT: .LBB15_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB15_3 +; NO_SVE-NEXT: .LBB15_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB15_4 +; NO_SVE-NEXT: .LBB15_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB15_5 +; 
NO_SVE-NEXT: .LBB15_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB15_6 +; NO_SVE-NEXT: .LBB15_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB15_7 +; NO_SVE-NEXT: .LBB15_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB15_8 +; NO_SVE-NEXT: b .LBB15_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -311,6 +3552,166 @@ } define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB16_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB16_3 +; NO_SVE-NEXT: b .LBB16_4 +; NO_SVE-NEXT: .LBB16_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB16_4 +; NO_SVE-NEXT: .LBB16_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB16_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB16_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB16_13 +; NO_SVE-NEXT: .LBB16_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB16_14 +; NO_SVE-NEXT: .LBB16_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB16_15 +; NO_SVE-NEXT: .LBB16_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB16_16 +; NO_SVE-NEXT: .LBB16_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB16_17 +; NO_SVE-NEXT: .LBB16_10: // %else20 
+; NO_SVE-NEXT: tbz w8, #8, .LBB16_18 +; NO_SVE-NEXT: .LBB16_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB16_19 +; NO_SVE-NEXT: b .LBB16_20 +; NO_SVE-NEXT: .LBB16_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB16_6 +; NO_SVE-NEXT: .LBB16_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB16_7 +; NO_SVE-NEXT: .LBB16_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB16_8 +; NO_SVE-NEXT: .LBB16_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB16_9 +; NO_SVE-NEXT: .LBB16_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB16_10 +; NO_SVE-NEXT: .LBB16_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB16_11 +; NO_SVE-NEXT: .LBB16_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB16_20 +; NO_SVE-NEXT: .LBB16_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB16_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB16_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB16_29 +; NO_SVE-NEXT: .LBB16_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB16_30 +; NO_SVE-NEXT: .LBB16_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB16_31 +; NO_SVE-NEXT: .LBB16_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB16_32 +; NO_SVE-NEXT: .LBB16_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB16_27 +; NO_SVE-NEXT: .LBB16_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB16_27: // %else44 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB16_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB16_22 +; NO_SVE-NEXT: .LBB16_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB16_23 +; NO_SVE-NEXT: .LBB16_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB16_24 +; NO_SVE-NEXT: .LBB16_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB16_25 +; NO_SVE-NEXT: .LBB16_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB16_26 +; NO_SVE-NEXT: b .LBB16_27 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -327,6 +3728,93 @@ } define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; 
NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB17_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB17_11 +; NO_SVE-NEXT: .LBB17_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB17_12 +; NO_SVE-NEXT: .LBB17_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB17_13 +; NO_SVE-NEXT: .LBB17_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB17_14 +; NO_SVE-NEXT: .LBB17_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB17_15 +; NO_SVE-NEXT: .LBB17_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB17_16 +; NO_SVE-NEXT: .LBB17_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB17_9 +; NO_SVE-NEXT: .LBB17_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB17_9: // %else20 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB17_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB17_2 +; NO_SVE-NEXT: .LBB17_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB17_3 +; NO_SVE-NEXT: .LBB17_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB17_4 +; NO_SVE-NEXT: .LBB17_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB17_5 +; NO_SVE-NEXT: .LBB17_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB17_6 +; NO_SVE-NEXT: .LBB17_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB17_7 +; NO_SVE-NEXT: .LBB17_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB17_8 +; NO_SVE-NEXT: b .LBB17_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -343,6 +3831,94 @@ } define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: 
and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB18_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB18_3 +; NO_SVE-NEXT: b .LBB18_4 +; NO_SVE-NEXT: .LBB18_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB18_4 +; NO_SVE-NEXT: .LBB18_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB18_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB18_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB18_9 +; NO_SVE-NEXT: .LBB18_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB18_10 +; NO_SVE-NEXT: .LBB18_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB18_11 +; NO_SVE-NEXT: b .LBB18_12 +; NO_SVE-NEXT: .LBB18_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB18_6 +; NO_SVE-NEXT: .LBB18_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB18_7 +; NO_SVE-NEXT: .LBB18_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB18_12 +; NO_SVE-NEXT: .LBB18_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB18_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB18_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB18_15 +; NO_SVE-NEXT: .LBB18_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB18_15: // %else20 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB18_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB18_14 +; NO_SVE-NEXT: b .LBB18_15 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -359,6 +3935,306 @@ } define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 
+; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w17, v0.b[9] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: umov w8, v0.b[10] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[11] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v1.b[2] +; NO_SVE-NEXT: bfi w12, w10, #1, #1 +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #9 +; NO_SVE-NEXT: umov w15, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #10 +; NO_SVE-NEXT: umov w9, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w14, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w12, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: bfi w12, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v0.b[14] +; NO_SVE-NEXT: bfi w12, w9, #5, #1 +; NO_SVE-NEXT: and w9, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[8] +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w12, w9, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #13 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[11] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[12] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[13] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB19_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB19_3 +; NO_SVE-NEXT: b .LBB19_4 +; NO_SVE-NEXT: .LBB19_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB19_4 +; NO_SVE-NEXT: .LBB19_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB19_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB19_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB19_21 +; NO_SVE-NEXT: .LBB19_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB19_22 +; NO_SVE-NEXT: .LBB19_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB19_23 +; NO_SVE-NEXT: .LBB19_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB19_24 +; NO_SVE-NEXT: .LBB19_9: 
// %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB19_25 +; NO_SVE-NEXT: .LBB19_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB19_26 +; NO_SVE-NEXT: .LBB19_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB19_27 +; NO_SVE-NEXT: .LBB19_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB19_28 +; NO_SVE-NEXT: .LBB19_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB19_29 +; NO_SVE-NEXT: .LBB19_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB19_30 +; NO_SVE-NEXT: .LBB19_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB19_31 +; NO_SVE-NEXT: .LBB19_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB19_32 +; NO_SVE-NEXT: .LBB19_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB19_33 +; NO_SVE-NEXT: .LBB19_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB19_34 +; NO_SVE-NEXT: .LBB19_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB19_35 +; NO_SVE-NEXT: b .LBB19_36 +; NO_SVE-NEXT: .LBB19_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB19_6 +; NO_SVE-NEXT: .LBB19_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB19_7 +; NO_SVE-NEXT: .LBB19_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB19_8 +; NO_SVE-NEXT: .LBB19_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB19_9 +; NO_SVE-NEXT: .LBB19_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB19_10 +; NO_SVE-NEXT: .LBB19_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB19_11 +; NO_SVE-NEXT: .LBB19_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB19_12 +; NO_SVE-NEXT: .LBB19_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB19_13 +; NO_SVE-NEXT: .LBB19_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB19_14 +; NO_SVE-NEXT: .LBB19_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB19_15 +; NO_SVE-NEXT: .LBB19_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB19_16 +; NO_SVE-NEXT: .LBB19_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB19_17 +; NO_SVE-NEXT: .LBB19_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB19_18 +; NO_SVE-NEXT: .LBB19_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB19_19 +; NO_SVE-NEXT: .LBB19_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB19_36 +; NO_SVE-NEXT: .LBB19_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB19_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB19_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB19_53 +; NO_SVE-NEXT: .LBB19_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB19_54 +; NO_SVE-NEXT: .LBB19_39: // %else59 +; NO_SVE-NEXT: tbnz 
w8, #21, .LBB19_55 +; NO_SVE-NEXT: .LBB19_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB19_56 +; NO_SVE-NEXT: .LBB19_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB19_57 +; NO_SVE-NEXT: .LBB19_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB19_58 +; NO_SVE-NEXT: .LBB19_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB19_59 +; NO_SVE-NEXT: .LBB19_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB19_60 +; NO_SVE-NEXT: .LBB19_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB19_61 +; NO_SVE-NEXT: .LBB19_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB19_62 +; NO_SVE-NEXT: .LBB19_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB19_63 +; NO_SVE-NEXT: .LBB19_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB19_64 +; NO_SVE-NEXT: .LBB19_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB19_51 +; NO_SVE-NEXT: .LBB19_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB19_51: // %else92 +; NO_SVE-NEXT: ushll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB19_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB19_38 +; NO_SVE-NEXT: .LBB19_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB19_39 +; NO_SVE-NEXT: .LBB19_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB19_40 +; NO_SVE-NEXT: .LBB19_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB19_41 +; NO_SVE-NEXT: .LBB19_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB19_42 +; NO_SVE-NEXT: .LBB19_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB19_43 +; NO_SVE-NEXT: .LBB19_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB19_44 +; NO_SVE-NEXT: .LBB19_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB19_45 +; NO_SVE-NEXT: .LBB19_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB19_46 +; NO_SVE-NEXT: .LBB19_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB19_47 +; NO_SVE-NEXT: .LBB19_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB19_48 +; NO_SVE-NEXT: .LBB19_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB19_49 +; NO_SVE-NEXT: .LBB19_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB19_50 +; NO_SVE-NEXT: b .LBB19_51 +; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -375,6 +4251,164 @@ } define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, 
[x1] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[8] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[9] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[10] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[11] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[12] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[14] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB20_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB20_19 +; NO_SVE-NEXT: .LBB20_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB20_20 +; NO_SVE-NEXT: .LBB20_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB20_21 +; NO_SVE-NEXT: .LBB20_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB20_22 +; NO_SVE-NEXT: .LBB20_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB20_23 +; NO_SVE-NEXT: .LBB20_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB20_24 +; NO_SVE-NEXT: .LBB20_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB20_25 +; NO_SVE-NEXT: .LBB20_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB20_26 +; NO_SVE-NEXT: .LBB20_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB20_27 +; NO_SVE-NEXT: .LBB20_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB20_28 +; NO_SVE-NEXT: .LBB20_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB20_29 +; NO_SVE-NEXT: .LBB20_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB20_30 +; NO_SVE-NEXT: .LBB20_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB20_31 +; NO_SVE-NEXT: .LBB20_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB20_32 +; NO_SVE-NEXT: .LBB20_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB20_17 +; NO_SVE-NEXT: .LBB20_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB20_17: // %else44 +; NO_SVE-NEXT: ushll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB20_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB20_2 +; NO_SVE-NEXT: .LBB20_19: // %cond.load1 +; NO_SVE-NEXT: 
add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB20_3 +; NO_SVE-NEXT: .LBB20_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB20_4 +; NO_SVE-NEXT: .LBB20_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB20_5 +; NO_SVE-NEXT: .LBB20_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB20_6 +; NO_SVE-NEXT: .LBB20_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB20_7 +; NO_SVE-NEXT: .LBB20_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB20_8 +; NO_SVE-NEXT: .LBB20_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB20_9 +; NO_SVE-NEXT: .LBB20_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB20_10 +; NO_SVE-NEXT: .LBB20_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB20_11 +; NO_SVE-NEXT: .LBB20_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB20_12 +; NO_SVE-NEXT: .LBB20_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB20_13 +; NO_SVE-NEXT: .LBB20_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB20_14 +; NO_SVE-NEXT: .LBB20_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB20_15 +; NO_SVE-NEXT: .LBB20_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB20_16 +; NO_SVE-NEXT: b .LBB20_17 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -391,6 +4425,93 @@ } define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: cmeq v0.8b, v0.8b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB21_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB21_11 +; NO_SVE-NEXT: .LBB21_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB21_12 +; NO_SVE-NEXT: .LBB21_3: // 
%else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB21_13 +; NO_SVE-NEXT: .LBB21_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB21_14 +; NO_SVE-NEXT: .LBB21_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB21_15 +; NO_SVE-NEXT: .LBB21_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB21_16 +; NO_SVE-NEXT: .LBB21_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB21_9 +; NO_SVE-NEXT: .LBB21_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB21_9: // %else20 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB21_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB21_2 +; NO_SVE-NEXT: .LBB21_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB21_3 +; NO_SVE-NEXT: .LBB21_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB21_4 +; NO_SVE-NEXT: .LBB21_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB21_5 +; NO_SVE-NEXT: .LBB21_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB21_6 +; NO_SVE-NEXT: .LBB21_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB21_7 +; NO_SVE-NEXT: .LBB21_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB21_8 +; NO_SVE-NEXT: b .LBB21_9 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -407,6 +4528,166 @@ } define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr 
w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB22_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB22_3 +; NO_SVE-NEXT: b .LBB22_4 +; NO_SVE-NEXT: .LBB22_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB22_4 +; NO_SVE-NEXT: .LBB22_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB22_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB22_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB22_13 +; NO_SVE-NEXT: .LBB22_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB22_14 +; NO_SVE-NEXT: .LBB22_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB22_15 +; NO_SVE-NEXT: .LBB22_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB22_16 +; NO_SVE-NEXT: .LBB22_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB22_17 +; NO_SVE-NEXT: .LBB22_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB22_18 +; NO_SVE-NEXT: .LBB22_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB22_19 +; NO_SVE-NEXT: b .LBB22_20 +; NO_SVE-NEXT: .LBB22_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB22_6 +; NO_SVE-NEXT: .LBB22_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB22_7 +; NO_SVE-NEXT: .LBB22_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB22_8 +; NO_SVE-NEXT: .LBB22_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB22_9 +; NO_SVE-NEXT: .LBB22_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB22_10 +; NO_SVE-NEXT: .LBB22_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB22_11 +; NO_SVE-NEXT: .LBB22_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB22_20 +; NO_SVE-NEXT: .LBB22_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB22_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB22_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB22_29 +; NO_SVE-NEXT: .LBB22_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB22_30 +; NO_SVE-NEXT: .LBB22_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB22_31 +; NO_SVE-NEXT: .LBB22_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB22_32 +; NO_SVE-NEXT: .LBB22_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB22_27 +; NO_SVE-NEXT: .LBB22_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB22_27: // %else44 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: 
.LBB22_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB22_22 +; NO_SVE-NEXT: .LBB22_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB22_23 +; NO_SVE-NEXT: .LBB22_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB22_24 +; NO_SVE-NEXT: .LBB22_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB22_25 +; NO_SVE-NEXT: .LBB22_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB22_26 +; NO_SVE-NEXT: b .LBB22_27 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -423,6 +4704,93 @@ } define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB23_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB23_11 +; NO_SVE-NEXT: .LBB23_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB23_12 +; NO_SVE-NEXT: .LBB23_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB23_13 +; NO_SVE-NEXT: .LBB23_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB23_14 +; NO_SVE-NEXT: .LBB23_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB23_15 +; NO_SVE-NEXT: .LBB23_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB23_16 +; NO_SVE-NEXT: .LBB23_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB23_9 +; NO_SVE-NEXT: .LBB23_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB23_9: // %else20 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB23_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB23_2 +; NO_SVE-NEXT: .LBB23_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB23_3 +; NO_SVE-NEXT: .LBB23_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB23_4 +; NO_SVE-NEXT: .LBB23_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; 
NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB23_5 +; NO_SVE-NEXT: .LBB23_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB23_6 +; NO_SVE-NEXT: .LBB23_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB23_7 +; NO_SVE-NEXT: .LBB23_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB23_8 +; NO_SVE-NEXT: b .LBB23_9 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -439,6 +4807,94 @@ } define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB24_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB24_3 +; NO_SVE-NEXT: b .LBB24_4 +; NO_SVE-NEXT: .LBB24_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB24_4 +; NO_SVE-NEXT: .LBB24_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB24_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB24_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB24_9 +; NO_SVE-NEXT: .LBB24_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB24_10 +; NO_SVE-NEXT: .LBB24_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB24_11 +; NO_SVE-NEXT: b .LBB24_12 +; NO_SVE-NEXT: .LBB24_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB24_6 +; NO_SVE-NEXT: .LBB24_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB24_7 +; NO_SVE-NEXT: .LBB24_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB24_12 +; NO_SVE-NEXT: .LBB24_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB24_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB24_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB24_15 +; NO_SVE-NEXT: .LBB24_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB24_15: // %else20 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, 
v0.2s, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB24_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB24_14 +; NO_SVE-NEXT: b .LBB24_15 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -455,6 +4911,313 @@ } define <32 x i16> @masked_load_sext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i8i16_m16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[1] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: umov w9, v2.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w11, w14, #1, #1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w16, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w9, #3, #1 +; NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w11, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] 
+; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB25_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB25_3 +; NO_SVE-NEXT: b .LBB25_4 +; NO_SVE-NEXT: .LBB25_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB25_4 +; NO_SVE-NEXT: .LBB25_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB25_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB25_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB25_21 +; NO_SVE-NEXT: .LBB25_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB25_22 +; NO_SVE-NEXT: .LBB25_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB25_23 +; NO_SVE-NEXT: .LBB25_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB25_24 +; NO_SVE-NEXT: .LBB25_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB25_25 +; NO_SVE-NEXT: .LBB25_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB25_26 +; NO_SVE-NEXT: .LBB25_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB25_27 +; NO_SVE-NEXT: .LBB25_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB25_28 +; NO_SVE-NEXT: .LBB25_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB25_29 +; NO_SVE-NEXT: .LBB25_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB25_30 +; NO_SVE-NEXT: .LBB25_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB25_31 +; NO_SVE-NEXT: .LBB25_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB25_32 +; NO_SVE-NEXT: .LBB25_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB25_33 +; NO_SVE-NEXT: .LBB25_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB25_34 +; NO_SVE-NEXT: .LBB25_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB25_35 +; NO_SVE-NEXT: b .LBB25_36 +; NO_SVE-NEXT: .LBB25_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB25_6 +; NO_SVE-NEXT: .LBB25_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB25_7 +; NO_SVE-NEXT: .LBB25_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB25_8 +; NO_SVE-NEXT: .LBB25_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB25_9 +; NO_SVE-NEXT: .LBB25_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB25_10 +; NO_SVE-NEXT: .LBB25_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB25_11 +; NO_SVE-NEXT: .LBB25_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; 
NO_SVE-NEXT: tbz w8, #9, .LBB25_12 +; NO_SVE-NEXT: .LBB25_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB25_13 +; NO_SVE-NEXT: .LBB25_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB25_14 +; NO_SVE-NEXT: .LBB25_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB25_15 +; NO_SVE-NEXT: .LBB25_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB25_16 +; NO_SVE-NEXT: .LBB25_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB25_17 +; NO_SVE-NEXT: .LBB25_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB25_18 +; NO_SVE-NEXT: .LBB25_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB25_19 +; NO_SVE-NEXT: .LBB25_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB25_36 +; NO_SVE-NEXT: .LBB25_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB25_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB25_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB25_53 +; NO_SVE-NEXT: .LBB25_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB25_54 +; NO_SVE-NEXT: .LBB25_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB25_55 +; NO_SVE-NEXT: .LBB25_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB25_56 +; NO_SVE-NEXT: .LBB25_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB25_57 +; NO_SVE-NEXT: .LBB25_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB25_58 +; NO_SVE-NEXT: .LBB25_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB25_59 +; NO_SVE-NEXT: .LBB25_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB25_60 +; NO_SVE-NEXT: .LBB25_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB25_61 +; NO_SVE-NEXT: .LBB25_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB25_62 +; NO_SVE-NEXT: .LBB25_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB25_63 +; NO_SVE-NEXT: .LBB25_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB25_64 +; NO_SVE-NEXT: .LBB25_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB25_51 +; NO_SVE-NEXT: .LBB25_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB25_51: // %else92 +; NO_SVE-NEXT: sshll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB25_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB25_38 +; NO_SVE-NEXT: .LBB25_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB25_39 +; NO_SVE-NEXT: .LBB25_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB25_40 +; NO_SVE-NEXT: .LBB25_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB25_41 +; NO_SVE-NEXT: .LBB25_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB25_42 +; NO_SVE-NEXT: 
.LBB25_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB25_43 +; NO_SVE-NEXT: .LBB25_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB25_44 +; NO_SVE-NEXT: .LBB25_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB25_45 +; NO_SVE-NEXT: .LBB25_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB25_46 +; NO_SVE-NEXT: .LBB25_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB25_47 +; NO_SVE-NEXT: .LBB25_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB25_48 +; NO_SVE-NEXT: .LBB25_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB25_49 +; NO_SVE-NEXT: .LBB25_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB25_50 +; NO_SVE-NEXT: b .LBB25_51 +; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -471,6 +5234,172 @@ } define <16 x i32> @masked_load_sext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i8i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // 
implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB26_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB26_19 +; NO_SVE-NEXT: .LBB26_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB26_20 +; NO_SVE-NEXT: .LBB26_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB26_21 +; NO_SVE-NEXT: .LBB26_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB26_22 +; NO_SVE-NEXT: .LBB26_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB26_23 +; NO_SVE-NEXT: .LBB26_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB26_24 +; NO_SVE-NEXT: .LBB26_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB26_25 +; NO_SVE-NEXT: .LBB26_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB26_26 +; NO_SVE-NEXT: .LBB26_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB26_27 +; NO_SVE-NEXT: .LBB26_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB26_28 +; NO_SVE-NEXT: .LBB26_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB26_29 +; NO_SVE-NEXT: .LBB26_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB26_30 +; NO_SVE-NEXT: .LBB26_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB26_31 +; NO_SVE-NEXT: .LBB26_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB26_32 +; NO_SVE-NEXT: .LBB26_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB26_17 +; NO_SVE-NEXT: .LBB26_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB26_17: // %else44 +; NO_SVE-NEXT: sshll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB26_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB26_2 +; NO_SVE-NEXT: .LBB26_19: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB26_3 +; NO_SVE-NEXT: .LBB26_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB26_4 +; NO_SVE-NEXT: .LBB26_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB26_5 +; NO_SVE-NEXT: .LBB26_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB26_6 +; NO_SVE-NEXT: .LBB26_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB26_7 +; NO_SVE-NEXT: .LBB26_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB26_8 +; NO_SVE-NEXT: .LBB26_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB26_9 +; NO_SVE-NEXT: .LBB26_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB26_10 +; NO_SVE-NEXT: .LBB26_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB26_11 +; NO_SVE-NEXT: .LBB26_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB26_12 +; NO_SVE-NEXT: .LBB26_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB26_13 +; NO_SVE-NEXT: .LBB26_30: // 
%cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB26_14 +; NO_SVE-NEXT: .LBB26_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB26_15 +; NO_SVE-NEXT: .LBB26_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB26_16 +; NO_SVE-NEXT: b .LBB26_17 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -487,6 +5416,101 @@ } define <8 x i64> @masked_load_sext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i8i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB27_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB27_11 +; NO_SVE-NEXT: .LBB27_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB27_12 +; NO_SVE-NEXT: .LBB27_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB27_13 +; NO_SVE-NEXT: .LBB27_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB27_14 +; NO_SVE-NEXT: .LBB27_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB27_15 +; NO_SVE-NEXT: .LBB27_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB27_16 +; NO_SVE-NEXT: .LBB27_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB27_9 +; NO_SVE-NEXT: .LBB27_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB27_9: // %else20 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB27_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB27_2 +; NO_SVE-NEXT: .LBB27_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB27_3 +; NO_SVE-NEXT: .LBB27_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB27_4 +; NO_SVE-NEXT: .LBB27_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, 
#3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB27_5 +; NO_SVE-NEXT: .LBB27_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB27_6 +; NO_SVE-NEXT: .LBB27_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB27_7 +; NO_SVE-NEXT: .LBB27_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB27_8 +; NO_SVE-NEXT: b .LBB27_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -503,6 +5527,171 @@ } define <16 x i32> @masked_load_sext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i16i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB28_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB28_3 +; NO_SVE-NEXT: b .LBB28_4 +; NO_SVE-NEXT: .LBB28_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB28_4 +; NO_SVE-NEXT: .LBB28_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB28_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB28_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB28_13 +; NO_SVE-NEXT: .LBB28_6: // %else8 +; NO_SVE-NEXT: 
tbnz w8, #4, .LBB28_14 +; NO_SVE-NEXT: .LBB28_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB28_15 +; NO_SVE-NEXT: .LBB28_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB28_16 +; NO_SVE-NEXT: .LBB28_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB28_17 +; NO_SVE-NEXT: .LBB28_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB28_18 +; NO_SVE-NEXT: .LBB28_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB28_19 +; NO_SVE-NEXT: b .LBB28_20 +; NO_SVE-NEXT: .LBB28_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB28_6 +; NO_SVE-NEXT: .LBB28_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB28_7 +; NO_SVE-NEXT: .LBB28_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB28_8 +; NO_SVE-NEXT: .LBB28_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB28_9 +; NO_SVE-NEXT: .LBB28_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB28_10 +; NO_SVE-NEXT: .LBB28_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB28_11 +; NO_SVE-NEXT: .LBB28_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB28_20 +; NO_SVE-NEXT: .LBB28_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB28_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB28_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB28_29 +; NO_SVE-NEXT: .LBB28_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB28_30 +; NO_SVE-NEXT: .LBB28_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB28_31 +; NO_SVE-NEXT: .LBB28_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB28_32 +; NO_SVE-NEXT: .LBB28_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB28_27 +; NO_SVE-NEXT: .LBB28_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB28_27: // %else44 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB28_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB28_22 +; NO_SVE-NEXT: .LBB28_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB28_23 +; NO_SVE-NEXT: .LBB28_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB28_24 +; NO_SVE-NEXT: .LBB28_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB28_25 +; NO_SVE-NEXT: .LBB28_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB28_26 +; NO_SVE-NEXT: b .LBB28_27 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -519,6 +5708,100 @@ } define <8 x i64> @masked_load_sext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i16i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: 
sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB29_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB29_11 +; NO_SVE-NEXT: .LBB29_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB29_12 +; NO_SVE-NEXT: .LBB29_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB29_13 +; NO_SVE-NEXT: .LBB29_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB29_14 +; NO_SVE-NEXT: .LBB29_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB29_15 +; NO_SVE-NEXT: .LBB29_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB29_16 +; NO_SVE-NEXT: .LBB29_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB29_9 +; NO_SVE-NEXT: .LBB29_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB29_9: // %else20 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB29_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB29_2 +; NO_SVE-NEXT: .LBB29_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB29_3 +; NO_SVE-NEXT: .LBB29_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB29_4 +; NO_SVE-NEXT: .LBB29_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB29_5 +; NO_SVE-NEXT: .LBB29_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB29_6 +; NO_SVE-NEXT: .LBB29_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB29_7 +; NO_SVE-NEXT: .LBB29_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB29_8 +; NO_SVE-NEXT: b .LBB29_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -535,6 +5818,99 @@ } define <8 x i64> @masked_load_sext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: 
masked_load_sext_v8i32i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB30_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB30_3 +; NO_SVE-NEXT: b .LBB30_4 +; NO_SVE-NEXT: .LBB30_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB30_4 +; NO_SVE-NEXT: .LBB30_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB30_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB30_9 +; NO_SVE-NEXT: .LBB30_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB30_10 +; NO_SVE-NEXT: .LBB30_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB30_11 +; NO_SVE-NEXT: b .LBB30_12 +; NO_SVE-NEXT: .LBB30_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB30_6 +; NO_SVE-NEXT: .LBB30_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB30_7 +; NO_SVE-NEXT: .LBB30_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB30_12 +; NO_SVE-NEXT: .LBB30_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB30_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB30_15 +; NO_SVE-NEXT: .LBB30_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB30_15: // %else20 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB30_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB30_14 +; NO_SVE-NEXT: b .LBB30_15 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -551,6 +5927,313 @@ } define <32 x i16> @masked_load_zext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: 
masked_load_zext_v32i8i16_m16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[1] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: umov w9, v2.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w11, w14, #1, #1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w16, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w9, #3, #1 +; NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w11, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: 
orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB31_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB31_3 +; NO_SVE-NEXT: b .LBB31_4 +; NO_SVE-NEXT: .LBB31_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB31_4 +; NO_SVE-NEXT: .LBB31_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB31_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB31_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB31_21 +; NO_SVE-NEXT: .LBB31_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB31_22 +; NO_SVE-NEXT: .LBB31_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB31_23 +; NO_SVE-NEXT: .LBB31_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB31_24 +; NO_SVE-NEXT: .LBB31_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB31_25 +; NO_SVE-NEXT: .LBB31_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB31_26 +; NO_SVE-NEXT: .LBB31_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB31_27 +; NO_SVE-NEXT: .LBB31_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB31_28 +; NO_SVE-NEXT: .LBB31_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB31_29 +; NO_SVE-NEXT: .LBB31_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB31_30 +; NO_SVE-NEXT: .LBB31_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB31_31 +; NO_SVE-NEXT: .LBB31_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB31_32 +; NO_SVE-NEXT: .LBB31_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB31_33 +; NO_SVE-NEXT: .LBB31_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB31_34 +; NO_SVE-NEXT: .LBB31_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB31_35 +; NO_SVE-NEXT: b .LBB31_36 +; NO_SVE-NEXT: .LBB31_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB31_6 +; NO_SVE-NEXT: .LBB31_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB31_7 +; NO_SVE-NEXT: .LBB31_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB31_8 +; NO_SVE-NEXT: .LBB31_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB31_9 +; NO_SVE-NEXT: .LBB31_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB31_10 +; NO_SVE-NEXT: .LBB31_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB31_11 +; NO_SVE-NEXT: .LBB31_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB31_12 +; NO_SVE-NEXT: .LBB31_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB31_13 +; NO_SVE-NEXT: .LBB31_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB31_14 +; NO_SVE-NEXT: .LBB31_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB31_15 +; NO_SVE-NEXT: .LBB31_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 
+; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB31_16 +; NO_SVE-NEXT: .LBB31_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB31_17 +; NO_SVE-NEXT: .LBB31_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB31_18 +; NO_SVE-NEXT: .LBB31_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB31_19 +; NO_SVE-NEXT: .LBB31_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB31_36 +; NO_SVE-NEXT: .LBB31_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB31_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB31_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB31_53 +; NO_SVE-NEXT: .LBB31_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB31_54 +; NO_SVE-NEXT: .LBB31_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB31_55 +; NO_SVE-NEXT: .LBB31_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB31_56 +; NO_SVE-NEXT: .LBB31_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB31_57 +; NO_SVE-NEXT: .LBB31_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB31_58 +; NO_SVE-NEXT: .LBB31_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB31_59 +; NO_SVE-NEXT: .LBB31_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB31_60 +; NO_SVE-NEXT: .LBB31_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB31_61 +; NO_SVE-NEXT: .LBB31_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB31_62 +; NO_SVE-NEXT: .LBB31_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB31_63 +; NO_SVE-NEXT: .LBB31_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB31_64 +; NO_SVE-NEXT: .LBB31_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB31_51 +; NO_SVE-NEXT: .LBB31_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB31_51: // %else92 +; NO_SVE-NEXT: ushll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB31_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB31_38 +; NO_SVE-NEXT: .LBB31_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB31_39 +; NO_SVE-NEXT: .LBB31_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB31_40 +; NO_SVE-NEXT: .LBB31_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB31_41 +; NO_SVE-NEXT: .LBB31_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB31_42 +; NO_SVE-NEXT: .LBB31_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB31_43 +; NO_SVE-NEXT: .LBB31_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB31_44 +; NO_SVE-NEXT: .LBB31_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB31_45 +; NO_SVE-NEXT: .LBB31_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: 
tbz w8, #27, .LBB31_46 +; NO_SVE-NEXT: .LBB31_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB31_47 +; NO_SVE-NEXT: .LBB31_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB31_48 +; NO_SVE-NEXT: .LBB31_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB31_49 +; NO_SVE-NEXT: .LBB31_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB31_50 +; NO_SVE-NEXT: b .LBB31_51 +; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -567,6 +6250,172 @@ } define <16 x i32> @masked_load_zext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i8i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB32_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB32_19 +; NO_SVE-NEXT: .LBB32_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB32_20 +; NO_SVE-NEXT: .LBB32_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB32_21 +; NO_SVE-NEXT: .LBB32_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB32_22 +; NO_SVE-NEXT: .LBB32_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB32_23 +; NO_SVE-NEXT: 
.LBB32_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB32_24 +; NO_SVE-NEXT: .LBB32_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB32_25 +; NO_SVE-NEXT: .LBB32_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB32_26 +; NO_SVE-NEXT: .LBB32_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB32_27 +; NO_SVE-NEXT: .LBB32_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB32_28 +; NO_SVE-NEXT: .LBB32_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB32_29 +; NO_SVE-NEXT: .LBB32_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB32_30 +; NO_SVE-NEXT: .LBB32_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB32_31 +; NO_SVE-NEXT: .LBB32_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB32_32 +; NO_SVE-NEXT: .LBB32_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB32_17 +; NO_SVE-NEXT: .LBB32_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB32_17: // %else44 +; NO_SVE-NEXT: ushll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB32_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB32_2 +; NO_SVE-NEXT: .LBB32_19: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB32_3 +; NO_SVE-NEXT: .LBB32_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB32_4 +; NO_SVE-NEXT: .LBB32_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB32_5 +; NO_SVE-NEXT: .LBB32_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB32_6 +; NO_SVE-NEXT: .LBB32_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB32_7 +; NO_SVE-NEXT: .LBB32_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB32_8 +; NO_SVE-NEXT: .LBB32_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB32_9 +; NO_SVE-NEXT: .LBB32_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB32_10 +; NO_SVE-NEXT: .LBB32_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB32_11 +; NO_SVE-NEXT: .LBB32_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB32_12 +; NO_SVE-NEXT: .LBB32_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB32_13 +; NO_SVE-NEXT: .LBB32_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB32_14 +; NO_SVE-NEXT: .LBB32_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB32_15 +; NO_SVE-NEXT: .LBB32_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB32_16 +; NO_SVE-NEXT: b .LBB32_17 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, 
vl16 @@ -583,6 +6432,101 @@ } define <8 x i64> @masked_load_zext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i8i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB33_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB33_11 +; NO_SVE-NEXT: .LBB33_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB33_12 +; NO_SVE-NEXT: .LBB33_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB33_13 +; NO_SVE-NEXT: .LBB33_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB33_14 +; NO_SVE-NEXT: .LBB33_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB33_15 +; NO_SVE-NEXT: .LBB33_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB33_16 +; NO_SVE-NEXT: .LBB33_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB33_9 +; NO_SVE-NEXT: .LBB33_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB33_9: // %else20 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB33_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB33_2 +; NO_SVE-NEXT: .LBB33_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB33_3 +; NO_SVE-NEXT: .LBB33_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB33_4 +; NO_SVE-NEXT: .LBB33_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB33_5 +; NO_SVE-NEXT: .LBB33_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB33_6 +; NO_SVE-NEXT: .LBB33_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB33_7 +; NO_SVE-NEXT: .LBB33_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB33_8 +; NO_SVE-NEXT: b .LBB33_9 +; ; 
VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -599,6 +6543,171 @@ } define <16 x i32> @masked_load_zext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i16i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB34_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB34_3 +; NO_SVE-NEXT: b .LBB34_4 +; NO_SVE-NEXT: .LBB34_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB34_4 +; NO_SVE-NEXT: .LBB34_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB34_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB34_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB34_13 +; NO_SVE-NEXT: .LBB34_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB34_14 +; NO_SVE-NEXT: .LBB34_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB34_15 +; NO_SVE-NEXT: .LBB34_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB34_16 +; NO_SVE-NEXT: .LBB34_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB34_17 +; NO_SVE-NEXT: .LBB34_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB34_18 +; NO_SVE-NEXT: .LBB34_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB34_19 +; NO_SVE-NEXT: b .LBB34_20 +; NO_SVE-NEXT: .LBB34_12: // %cond.load4 +; 
NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB34_6 +; NO_SVE-NEXT: .LBB34_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB34_7 +; NO_SVE-NEXT: .LBB34_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB34_8 +; NO_SVE-NEXT: .LBB34_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB34_9 +; NO_SVE-NEXT: .LBB34_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB34_10 +; NO_SVE-NEXT: .LBB34_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB34_11 +; NO_SVE-NEXT: .LBB34_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB34_20 +; NO_SVE-NEXT: .LBB34_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB34_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB34_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB34_29 +; NO_SVE-NEXT: .LBB34_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB34_30 +; NO_SVE-NEXT: .LBB34_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB34_31 +; NO_SVE-NEXT: .LBB34_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB34_32 +; NO_SVE-NEXT: .LBB34_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB34_27 +; NO_SVE-NEXT: .LBB34_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB34_27: // %else44 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB34_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB34_22 +; NO_SVE-NEXT: .LBB34_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB34_23 +; NO_SVE-NEXT: .LBB34_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB34_24 +; NO_SVE-NEXT: .LBB34_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB34_25 +; NO_SVE-NEXT: .LBB34_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB34_26 +; NO_SVE-NEXT: b .LBB34_27 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -615,6 +6724,100 @@ } define <8 x i64> @masked_load_zext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i16i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: 
umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB35_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB35_11 +; NO_SVE-NEXT: .LBB35_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB35_12 +; NO_SVE-NEXT: .LBB35_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB35_13 +; NO_SVE-NEXT: .LBB35_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB35_14 +; NO_SVE-NEXT: .LBB35_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB35_15 +; NO_SVE-NEXT: .LBB35_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB35_16 +; NO_SVE-NEXT: .LBB35_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB35_9 +; NO_SVE-NEXT: .LBB35_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB35_9: // %else20 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB35_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB35_2 +; NO_SVE-NEXT: .LBB35_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB35_3 +; NO_SVE-NEXT: .LBB35_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB35_4 +; NO_SVE-NEXT: .LBB35_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB35_5 +; NO_SVE-NEXT: .LBB35_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB35_6 +; NO_SVE-NEXT: .LBB35_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB35_7 +; NO_SVE-NEXT: .LBB35_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB35_8 +; NO_SVE-NEXT: b .LBB35_9 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -631,6 +6834,99 @@ } define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i32i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov 
w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB36_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB36_3 +; NO_SVE-NEXT: b .LBB36_4 +; NO_SVE-NEXT: .LBB36_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB36_4 +; NO_SVE-NEXT: .LBB36_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB36_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB36_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB36_9 +; NO_SVE-NEXT: .LBB36_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB36_10 +; NO_SVE-NEXT: .LBB36_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB36_11 +; NO_SVE-NEXT: b .LBB36_12 +; NO_SVE-NEXT: .LBB36_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB36_6 +; NO_SVE-NEXT: .LBB36_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB36_7 +; NO_SVE-NEXT: .LBB36_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB36_12 +; NO_SVE-NEXT: .LBB36_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB36_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB36_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB36_15 +; NO_SVE-NEXT: .LBB36_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB36_15: // %else20 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB36_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB36_14 +; NO_SVE-NEXT: b .LBB36_15 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -647,6 +6943,1188 @@ } define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v128i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #48 +; NO_SVE-NEXT: .cfi_def_cfa_offset 48 +; NO_SVE-NEXT: str x19, [sp, #32] // 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset w19, -16 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; 
NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 
+; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: orr w15, w10, w15, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w13, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w15, w9, #16, #16 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w11, w14, #2, #1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[14] +; NO_SVE-NEXT: bfi w11, w14, #5, #1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v0.b[10] +; NO_SVE-NEXT: orr w10, w10, w17, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v0.b[12] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #14 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: orr w11, w11, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w16, lsl #11 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w13, w10, w13, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w14, lsl #13 +; NO_SVE-NEXT: orr w9, w10, w12, lsl #14 +; NO_SVE-NEXT: orr w10, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w10, w13, #16, #16 +; NO_SVE-NEXT: bfi x10, x15, #32, #32 +; NO_SVE-NEXT: tbz w10, #0, .LBB37_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w10, #1, .LBB37_3 +; NO_SVE-NEXT: b .LBB37_4 +; NO_SVE-NEXT: .LBB37_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w10, #1, .LBB37_4 +; NO_SVE-NEXT: .LBB37_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; 
NO_SVE-NEXT: .LBB37_4: // %else2 +; NO_SVE-NEXT: tbnz w10, #2, .LBB37_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w10, #3, .LBB37_21 +; NO_SVE-NEXT: .LBB37_6: // %else8 +; NO_SVE-NEXT: tbnz w10, #4, .LBB37_22 +; NO_SVE-NEXT: .LBB37_7: // %else11 +; NO_SVE-NEXT: tbnz w10, #5, .LBB37_23 +; NO_SVE-NEXT: .LBB37_8: // %else14 +; NO_SVE-NEXT: tbnz w10, #6, .LBB37_24 +; NO_SVE-NEXT: .LBB37_9: // %else17 +; NO_SVE-NEXT: tbnz w10, #7, .LBB37_25 +; NO_SVE-NEXT: .LBB37_10: // %else20 +; NO_SVE-NEXT: tbnz w10, #8, .LBB37_26 +; NO_SVE-NEXT: .LBB37_11: // %else23 +; NO_SVE-NEXT: tbnz w10, #9, .LBB37_27 +; NO_SVE-NEXT: .LBB37_12: // %else26 +; NO_SVE-NEXT: tbnz w10, #10, .LBB37_28 +; NO_SVE-NEXT: .LBB37_13: // %else29 +; NO_SVE-NEXT: tbnz w10, #11, .LBB37_29 +; NO_SVE-NEXT: .LBB37_14: // %else32 +; NO_SVE-NEXT: tbnz w10, #12, .LBB37_30 +; NO_SVE-NEXT: .LBB37_15: // %else35 +; NO_SVE-NEXT: tbnz w10, #13, .LBB37_31 +; NO_SVE-NEXT: .LBB37_16: // %else38 +; NO_SVE-NEXT: tbnz w10, #14, .LBB37_32 +; NO_SVE-NEXT: .LBB37_17: // %else41 +; NO_SVE-NEXT: tbnz w10, #15, .LBB37_33 +; NO_SVE-NEXT: .LBB37_18: // %else44 +; NO_SVE-NEXT: tbz w10, #16, .LBB37_34 +; NO_SVE-NEXT: .LBB37_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w10, #17, .LBB37_35 +; NO_SVE-NEXT: b .LBB37_36 +; NO_SVE-NEXT: .LBB37_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #3, .LBB37_6 +; NO_SVE-NEXT: .LBB37_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #4, .LBB37_7 +; NO_SVE-NEXT: .LBB37_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #5, .LBB37_8 +; NO_SVE-NEXT: .LBB37_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #6, .LBB37_9 +; NO_SVE-NEXT: .LBB37_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #7, .LBB37_10 +; NO_SVE-NEXT: .LBB37_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #8, .LBB37_11 +; NO_SVE-NEXT: .LBB37_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #9, .LBB37_12 +; NO_SVE-NEXT: .LBB37_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #10, .LBB37_13 +; NO_SVE-NEXT: .LBB37_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #11, .LBB37_14 +; NO_SVE-NEXT: .LBB37_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #12, .LBB37_15 +; NO_SVE-NEXT: .LBB37_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #13, .LBB37_16 +; NO_SVE-NEXT: .LBB37_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #14, .LBB37_17 +; NO_SVE-NEXT: .LBB37_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #15, .LBB37_18 +; NO_SVE-NEXT: .LBB37_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w10, #16, .LBB37_19 +; NO_SVE-NEXT: .LBB37_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: 
tbz w10, #17, .LBB37_36 +; NO_SVE-NEXT: .LBB37_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB37_36: // %else50 +; NO_SVE-NEXT: tbnz w10, #18, .LBB37_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w10, #19, .LBB37_53 +; NO_SVE-NEXT: .LBB37_38: // %else56 +; NO_SVE-NEXT: tbnz w10, #20, .LBB37_54 +; NO_SVE-NEXT: .LBB37_39: // %else59 +; NO_SVE-NEXT: tbnz w10, #21, .LBB37_55 +; NO_SVE-NEXT: .LBB37_40: // %else62 +; NO_SVE-NEXT: tbnz w10, #22, .LBB37_56 +; NO_SVE-NEXT: .LBB37_41: // %else65 +; NO_SVE-NEXT: tbnz w10, #23, .LBB37_57 +; NO_SVE-NEXT: .LBB37_42: // %else68 +; NO_SVE-NEXT: tbnz w10, #24, .LBB37_58 +; NO_SVE-NEXT: .LBB37_43: // %else71 +; NO_SVE-NEXT: tbnz w10, #25, .LBB37_59 +; NO_SVE-NEXT: .LBB37_44: // %else74 +; NO_SVE-NEXT: tbnz w10, #26, .LBB37_60 +; NO_SVE-NEXT: .LBB37_45: // %else77 +; NO_SVE-NEXT: tbnz w10, #27, .LBB37_61 +; NO_SVE-NEXT: .LBB37_46: // %else80 +; NO_SVE-NEXT: tbnz w10, #28, .LBB37_62 +; NO_SVE-NEXT: .LBB37_47: // %else83 +; NO_SVE-NEXT: tbnz w10, #29, .LBB37_63 +; NO_SVE-NEXT: .LBB37_48: // %else86 +; NO_SVE-NEXT: tbnz w10, #30, .LBB37_64 +; NO_SVE-NEXT: .LBB37_49: // %else89 +; NO_SVE-NEXT: tbnz w10, #31, .LBB37_65 +; NO_SVE-NEXT: .LBB37_50: // %else92 +; NO_SVE-NEXT: tbz x10, #32, .LBB37_66 +; NO_SVE-NEXT: .LBB37_51: // %cond.load94 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz x10, #33, .LBB37_67 +; NO_SVE-NEXT: b .LBB37_68 +; NO_SVE-NEXT: .LBB37_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #19, .LBB37_38 +; NO_SVE-NEXT: .LBB37_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #20, .LBB37_39 +; NO_SVE-NEXT: .LBB37_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #21, .LBB37_40 +; NO_SVE-NEXT: .LBB37_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #22, .LBB37_41 +; NO_SVE-NEXT: .LBB37_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #23, .LBB37_42 +; NO_SVE-NEXT: .LBB37_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #24, .LBB37_43 +; NO_SVE-NEXT: .LBB37_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #25, .LBB37_44 +; NO_SVE-NEXT: .LBB37_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #26, .LBB37_45 +; NO_SVE-NEXT: .LBB37_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #27, .LBB37_46 +; NO_SVE-NEXT: .LBB37_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #28, .LBB37_47 +; NO_SVE-NEXT: .LBB37_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #29, .LBB37_48 +; NO_SVE-NEXT: .LBB37_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #30, .LBB37_49 +; NO_SVE-NEXT: .LBB37_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #31, .LBB37_50 +; NO_SVE-NEXT: .LBB37_65: // %cond.load91 +; 
NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: tbnz x10, #32, .LBB37_51 +; NO_SVE-NEXT: .LBB37_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x10, #33, .LBB37_68 +; NO_SVE-NEXT: .LBB37_67: // %cond.load97 +; NO_SVE-NEXT: add x9, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB37_68: // %else98 +; NO_SVE-NEXT: tbnz x10, #34, .LBB37_91 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x10, #35, .LBB37_92 +; NO_SVE-NEXT: .LBB37_70: // %else104 +; NO_SVE-NEXT: tbnz x10, #36, .LBB37_93 +; NO_SVE-NEXT: .LBB37_71: // %else107 +; NO_SVE-NEXT: tbnz x10, #37, .LBB37_94 +; NO_SVE-NEXT: .LBB37_72: // %else110 +; NO_SVE-NEXT: tbnz x10, #38, .LBB37_95 +; NO_SVE-NEXT: .LBB37_73: // %else113 +; NO_SVE-NEXT: tbnz x10, #39, .LBB37_96 +; NO_SVE-NEXT: .LBB37_74: // %else116 +; NO_SVE-NEXT: tbnz x10, #40, .LBB37_97 +; NO_SVE-NEXT: .LBB37_75: // %else119 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: tbz x10, #41, .LBB37_77 +; NO_SVE-NEXT: .LBB37_76: // %cond.load121 +; NO_SVE-NEXT: add x9, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: .LBB37_77: // %else122 +; NO_SVE-NEXT: cmeq v7.16b, v4.16b, #0 +; NO_SVE-NEXT: ldp q3, q4, [x1, #96] +; NO_SVE-NEXT: cmeq v5.16b, v5.16b, #0 +; NO_SVE-NEXT: tbz x10, #42, .LBB37_79 +; NO_SVE-NEXT: // %bb.78: // %cond.load124 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: .LBB37_79: // %else125 +; NO_SVE-NEXT: umov w13, v7.b[1] +; NO_SVE-NEXT: umov w16, v7.b[0] +; NO_SVE-NEXT: umov w9, v5.b[1] +; NO_SVE-NEXT: umov w12, v5.b[0] +; NO_SVE-NEXT: cmeq v6.16b, v4.16b, #0 +; NO_SVE-NEXT: cmeq v4.16b, v3.16b, #0 +; NO_SVE-NEXT: tbz x10, #43, .LBB37_81 +; NO_SVE-NEXT: // %bb.80: // %cond.load127 +; NO_SVE-NEXT: add x11, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x11] +; NO_SVE-NEXT: .LBB37_81: // %else128 +; NO_SVE-NEXT: umov w15, v6.b[1] +; NO_SVE-NEXT: umov w18, v6.b[0] +; NO_SVE-NEXT: umov w11, v4.b[1] +; NO_SVE-NEXT: umov w1, v4.b[0] +; NO_SVE-NEXT: and w14, w13, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w3, v7.b[2] +; NO_SVE-NEXT: umov w16, v5.b[2] +; NO_SVE-NEXT: and w2, w9, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: tbz x10, #44, .LBB37_83 +; NO_SVE-NEXT: // %bb.82: // %cond.load130 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: .LBB37_83: // %else131 +; NO_SVE-NEXT: and w17, w15, #0x1 +; NO_SVE-NEXT: and w9, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[2] +; NO_SVE-NEXT: and w15, w11, #0x1 +; NO_SVE-NEXT: and w11, w1, #0x1 +; NO_SVE-NEXT: umov w1, v4.b[2] +; NO_SVE-NEXT: bfi w13, w14, #1, #1 +; NO_SVE-NEXT: umov w4, v7.b[3] +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w12, w2, #1, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: tbz x10, #45, .LBB37_85 +; NO_SVE-NEXT: // %bb.84: // %cond.load133 +; NO_SVE-NEXT: add x2, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x2] +; NO_SVE-NEXT: .LBB37_85: // %else134 +; NO_SVE-NEXT: bfi w9, w17, #1, #1 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[3] +; NO_SVE-NEXT: bfi w11, w15, #1, #1 +; NO_SVE-NEXT: umov w2, v4.b[3] +; NO_SVE-NEXT: bfi w13, w3, #2, #1 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[4] +; NO_SVE-NEXT: umov w15, v5.b[4] +; NO_SVE-NEXT: and w1, w1, #0x1 +; NO_SVE-NEXT: bfi w12, w16, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #46, .LBB37_87 +; NO_SVE-NEXT: // %bb.86: // %cond.load136 +; NO_SVE-NEXT: add x16, x0, #46 +; 
NO_SVE-NEXT: ld1 { v2.b }[14], [x16] +; NO_SVE-NEXT: .LBB37_87: // %else137 +; NO_SVE-NEXT: bfi w9, w17, #2, #1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: umov w17, v6.b[4] +; NO_SVE-NEXT: bfi w11, w1, #2, #1 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[4] +; NO_SVE-NEXT: umov w5, v7.b[5] +; NO_SVE-NEXT: umov w1, v5.b[5] +; NO_SVE-NEXT: bfi w13, w3, #3, #1 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: bfi w12, w14, #3, #1 +; NO_SVE-NEXT: and w4, w15, #0x1 +; NO_SVE-NEXT: tbz x10, #47, .LBB37_89 +; NO_SVE-NEXT: // %bb.88: // %cond.load139 +; NO_SVE-NEXT: add x14, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x14] +; NO_SVE-NEXT: .LBB37_89: // %else140 +; NO_SVE-NEXT: bfi w9, w16, #3, #1 +; NO_SVE-NEXT: umov w16, v6.b[5] +; NO_SVE-NEXT: bfi w11, w18, #3, #1 +; NO_SVE-NEXT: umov w18, v4.b[5] +; NO_SVE-NEXT: bfi w13, w3, #4, #1 +; NO_SVE-NEXT: umov w3, v7.b[6] +; NO_SVE-NEXT: umov w14, v5.b[6] +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: and w17, w2, #0x1 +; NO_SVE-NEXT: and w2, w5, #0x1 +; NO_SVE-NEXT: bfi w12, w4, #4, #1 +; NO_SVE-NEXT: and w1, w1, #0x1 +; NO_SVE-NEXT: tbz x10, #48, .LBB37_98 +; NO_SVE-NEXT: // %bb.90: // %cond.load142 +; NO_SVE-NEXT: add x4, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x4] +; NO_SVE-NEXT: b .LBB37_99 +; NO_SVE-NEXT: .LBB37_91: // %cond.load100 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz x10, #35, .LBB37_70 +; NO_SVE-NEXT: .LBB37_92: // %cond.load103 +; NO_SVE-NEXT: add x9, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz x10, #36, .LBB37_71 +; NO_SVE-NEXT: .LBB37_93: // %cond.load106 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz x10, #37, .LBB37_72 +; NO_SVE-NEXT: .LBB37_94: // %cond.load109 +; NO_SVE-NEXT: add x9, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz x10, #38, .LBB37_73 +; NO_SVE-NEXT: .LBB37_95: // %cond.load112 +; NO_SVE-NEXT: add x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz x10, #39, .LBB37_74 +; NO_SVE-NEXT: .LBB37_96: // %cond.load115 +; NO_SVE-NEXT: add x9, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz x10, #40, .LBB37_75 +; NO_SVE-NEXT: .LBB37_97: // %cond.load118 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: tbnz x10, #41, .LBB37_76 +; NO_SVE-NEXT: b .LBB37_77 +; NO_SVE-NEXT: .LBB37_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB37_99: // %else143 +; NO_SVE-NEXT: bfi w9, w15, #4, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v6.b[6] +; NO_SVE-NEXT: bfi w11, w17, #4, #1 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v4.b[6] +; NO_SVE-NEXT: bfi w13, w2, #5, #1 +; NO_SVE-NEXT: umov w4, v7.b[7] +; NO_SVE-NEXT: umov w2, v5.b[7] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w12, w1, #5, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #49, .LBB37_101 +; NO_SVE-NEXT: // %bb.100: // %cond.load145 +; NO_SVE-NEXT: add x1, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x1] +; NO_SVE-NEXT: .LBB37_101: // %else146 +; NO_SVE-NEXT: bfi w9, w15, #5, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v6.b[7] +; NO_SVE-NEXT: bfi w11, w17, #5, #1 +; NO_SVE-NEXT: umov w6, v4.b[7] +; NO_SVE-NEXT: orr w17, w13, w3, lsl #6 +; NO_SVE-NEXT: umov w19, v7.b[8] +; NO_SVE-NEXT: umov w13, v5.b[8] +; NO_SVE-NEXT: and w5, w18, #0x1 +; NO_SVE-NEXT: and w7, w4, #0x1 +; NO_SVE-NEXT: orr w18, w12, w14, lsl #6 +; NO_SVE-NEXT: 
and w1, w2, #0x1 +; NO_SVE-NEXT: tbz x10, #50, .LBB37_103 +; NO_SVE-NEXT: // %bb.102: // %cond.load148 +; NO_SVE-NEXT: add x12, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x12] +; NO_SVE-NEXT: .LBB37_103: // %else149 +; NO_SVE-NEXT: orr w12, w9, w15, lsl #6 +; NO_SVE-NEXT: umov w3, v6.b[8] +; NO_SVE-NEXT: orr w11, w11, w5, lsl #6 +; NO_SVE-NEXT: umov w5, v4.b[8] +; NO_SVE-NEXT: orr w14, w17, w7, lsl #7 +; NO_SVE-NEXT: umov w7, v7.b[9] +; NO_SVE-NEXT: umov w9, v5.b[9] +; NO_SVE-NEXT: and w2, w16, #0x1 +; NO_SVE-NEXT: and w4, w6, #0x1 +; NO_SVE-NEXT: and w6, w19, #0x1 +; NO_SVE-NEXT: orr w15, w18, w1, lsl #7 +; NO_SVE-NEXT: and w16, w13, #0x1 +; NO_SVE-NEXT: tbz x10, #51, .LBB37_105 +; NO_SVE-NEXT: // %bb.104: // %cond.load151 +; NO_SVE-NEXT: add x13, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x13] +; NO_SVE-NEXT: .LBB37_105: // %else152 +; NO_SVE-NEXT: orr w12, w12, w2, lsl #7 +; NO_SVE-NEXT: umov w18, v6.b[9] +; NO_SVE-NEXT: orr w13, w11, w4, lsl #7 +; NO_SVE-NEXT: umov w2, v4.b[9] +; NO_SVE-NEXT: umov w4, v7.b[10] +; NO_SVE-NEXT: umov w11, v5.b[10] +; NO_SVE-NEXT: and w17, w3, #0x1 +; NO_SVE-NEXT: and w1, w5, #0x1 +; NO_SVE-NEXT: orr w14, w14, w6, lsl #8 +; NO_SVE-NEXT: and w3, w7, #0x1 +; NO_SVE-NEXT: orr w15, w15, w16, lsl #8 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #52, .LBB37_107 +; NO_SVE-NEXT: // %bb.106: // %cond.load154 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x9] +; NO_SVE-NEXT: .LBB37_107: // %else155 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #8 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[10] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #8 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[10] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #9 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[11] +; NO_SVE-NEXT: umov w9, v5.b[11] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #9 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #53, .LBB37_109 +; NO_SVE-NEXT: // %bb.108: // %cond.load157 +; NO_SVE-NEXT: add x11, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x11] +; NO_SVE-NEXT: .LBB37_109: // %else158 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #9 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[11] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #9 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[11] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #10 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[12] +; NO_SVE-NEXT: umov w11, v5.b[12] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #10 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #54, .LBB37_111 +; NO_SVE-NEXT: // %bb.110: // %cond.load160 +; NO_SVE-NEXT: add x9, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x9] +; NO_SVE-NEXT: .LBB37_111: // %else161 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #10 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[12] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #10 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[12] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #11 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[13] +; NO_SVE-NEXT: umov w9, v5.b[13] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #11 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #55, .LBB37_113 +; NO_SVE-NEXT: // %bb.112: // %cond.load163 +; NO_SVE-NEXT: add x11, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x11] +; NO_SVE-NEXT: .LBB37_113: // %else164 +; NO_SVE-NEXT: orr w11, w12, w17, lsl #11 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[13] +; NO_SVE-NEXT: orr w12, w13, w1, lsl #11 +; 
NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[13] +; NO_SVE-NEXT: orr w13, w14, w3, lsl #12 +; NO_SVE-NEXT: and w14, w4, #0x1 +; NO_SVE-NEXT: umov w3, v7.b[14] +; NO_SVE-NEXT: umov w4, v5.b[14] +; NO_SVE-NEXT: orr w5, w15, w16, lsl #12 +; NO_SVE-NEXT: and w6, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #56, .LBB37_115 +; NO_SVE-NEXT: // %bb.114: // %cond.load166 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x9] +; NO_SVE-NEXT: .LBB37_115: // %else167 +; NO_SVE-NEXT: orr w15, w11, w17, lsl #12 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: umov w17, v6.b[14] +; NO_SVE-NEXT: orr w18, w12, w1, lsl #12 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[14] +; NO_SVE-NEXT: orr w9, w13, w14, lsl #13 +; NO_SVE-NEXT: and w11, w3, #0x1 +; NO_SVE-NEXT: orr w12, w5, w6, lsl #13 +; NO_SVE-NEXT: and w13, w4, #0x1 +; NO_SVE-NEXT: tbz x10, #57, .LBB37_117 +; NO_SVE-NEXT: // %bb.116: // %cond.load169 +; NO_SVE-NEXT: add x14, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x14] +; NO_SVE-NEXT: .LBB37_117: // %else170 +; NO_SVE-NEXT: orr w14, w15, w16, lsl #13 +; NO_SVE-NEXT: orr w15, w18, w1, lsl #13 +; NO_SVE-NEXT: umov w18, v7.b[15] +; NO_SVE-NEXT: umov w1, v5.b[15] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: and w17, w2, #0x1 +; NO_SVE-NEXT: orr w2, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: tbz x10, #58, .LBB37_119 +; NO_SVE-NEXT: // %bb.118: // %cond.load172 +; NO_SVE-NEXT: add x11, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x11] +; NO_SVE-NEXT: .LBB37_119: // %else173 +; NO_SVE-NEXT: umov w12, v6.b[15] +; NO_SVE-NEXT: orr w13, w15, w17, lsl #14 +; NO_SVE-NEXT: umov w15, v4.b[15] +; NO_SVE-NEXT: orr w11, w14, w16, lsl #14 +; NO_SVE-NEXT: orr w14, w2, w18, lsl #15 +; NO_SVE-NEXT: orr w9, w9, w1, lsl #15 +; NO_SVE-NEXT: tbz x10, #59, .LBB37_121 +; NO_SVE-NEXT: // %bb.120: // %cond.load175 +; NO_SVE-NEXT: add x16, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x16] +; NO_SVE-NEXT: .LBB37_121: // %else176 +; NO_SVE-NEXT: orr w12, w11, w12, lsl #15 +; NO_SVE-NEXT: orr w11, w13, w15, lsl #15 +; NO_SVE-NEXT: bfi w9, w14, #16, #16 +; NO_SVE-NEXT: tbnz x10, #60, .LBB37_127 +; NO_SVE-NEXT: // %bb.122: // %else179 +; NO_SVE-NEXT: bfi w11, w12, #16, #16 +; NO_SVE-NEXT: tbnz x10, #61, .LBB37_128 +; NO_SVE-NEXT: .LBB37_123: // %else182 +; NO_SVE-NEXT: tbnz x10, #62, .LBB37_129 +; NO_SVE-NEXT: .LBB37_124: // %else185 +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbnz x10, #63, .LBB37_130 +; NO_SVE-NEXT: .LBB37_125: // %else188 +; NO_SVE-NEXT: tbz w9, #0, .LBB37_131 +; NO_SVE-NEXT: .LBB37_126: // %cond.load190 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #1, .LBB37_132 +; NO_SVE-NEXT: b .LBB37_133 +; NO_SVE-NEXT: .LBB37_127: // %cond.load178 +; NO_SVE-NEXT: add x13, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x13] +; NO_SVE-NEXT: bfi w11, w12, #16, #16 +; NO_SVE-NEXT: tbz x10, #61, .LBB37_123 +; NO_SVE-NEXT: .LBB37_128: // %cond.load181 +; NO_SVE-NEXT: add x12, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x12] +; NO_SVE-NEXT: tbz x10, #62, .LBB37_124 +; NO_SVE-NEXT: .LBB37_129: // %cond.load184 +; NO_SVE-NEXT: add x12, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x12] +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbz x10, #63, .LBB37_125 +; NO_SVE-NEXT: .LBB37_130: // %cond.load187 +; NO_SVE-NEXT: add x10, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #0, .LBB37_126 +; NO_SVE-NEXT: .LBB37_131: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #1, 
.LBB37_133 +; NO_SVE-NEXT: .LBB37_132: // %cond.load193 +; NO_SVE-NEXT: add x10, x0, #65 +; NO_SVE-NEXT: ld1 { v4.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_133: // %else194 +; NO_SVE-NEXT: tbnz w9, #2, .LBB37_149 +; NO_SVE-NEXT: // %bb.134: // %else197 +; NO_SVE-NEXT: tbnz w9, #3, .LBB37_150 +; NO_SVE-NEXT: .LBB37_135: // %else200 +; NO_SVE-NEXT: tbnz w9, #4, .LBB37_151 +; NO_SVE-NEXT: .LBB37_136: // %else203 +; NO_SVE-NEXT: tbnz w9, #5, .LBB37_152 +; NO_SVE-NEXT: .LBB37_137: // %else206 +; NO_SVE-NEXT: tbnz w9, #6, .LBB37_153 +; NO_SVE-NEXT: .LBB37_138: // %else209 +; NO_SVE-NEXT: tbnz w9, #7, .LBB37_154 +; NO_SVE-NEXT: .LBB37_139: // %else212 +; NO_SVE-NEXT: tbnz w9, #8, .LBB37_155 +; NO_SVE-NEXT: .LBB37_140: // %else215 +; NO_SVE-NEXT: tbnz w9, #9, .LBB37_156 +; NO_SVE-NEXT: .LBB37_141: // %else218 +; NO_SVE-NEXT: tbnz w9, #10, .LBB37_157 +; NO_SVE-NEXT: .LBB37_142: // %else221 +; NO_SVE-NEXT: tbnz w9, #11, .LBB37_158 +; NO_SVE-NEXT: .LBB37_143: // %else224 +; NO_SVE-NEXT: tbnz w9, #12, .LBB37_159 +; NO_SVE-NEXT: .LBB37_144: // %else227 +; NO_SVE-NEXT: tbnz w9, #13, .LBB37_160 +; NO_SVE-NEXT: .LBB37_145: // %else230 +; NO_SVE-NEXT: tbnz w9, #14, .LBB37_161 +; NO_SVE-NEXT: .LBB37_146: // %else233 +; NO_SVE-NEXT: tbnz w9, #15, .LBB37_162 +; NO_SVE-NEXT: .LBB37_147: // %else236 +; NO_SVE-NEXT: tbz w9, #16, .LBB37_163 +; NO_SVE-NEXT: .LBB37_148: // %cond.load238 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB37_164 +; NO_SVE-NEXT: b .LBB37_165 +; NO_SVE-NEXT: .LBB37_149: // %cond.load196 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB37_135 +; NO_SVE-NEXT: .LBB37_150: // %cond.load199 +; NO_SVE-NEXT: add x10, x0, #67 +; NO_SVE-NEXT: ld1 { v4.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB37_136 +; NO_SVE-NEXT: .LBB37_151: // %cond.load202 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB37_137 +; NO_SVE-NEXT: .LBB37_152: // %cond.load205 +; NO_SVE-NEXT: add x10, x0, #69 +; NO_SVE-NEXT: ld1 { v4.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB37_138 +; NO_SVE-NEXT: .LBB37_153: // %cond.load208 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB37_139 +; NO_SVE-NEXT: .LBB37_154: // %cond.load211 +; NO_SVE-NEXT: add x10, x0, #71 +; NO_SVE-NEXT: ld1 { v4.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB37_140 +; NO_SVE-NEXT: .LBB37_155: // %cond.load214 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB37_141 +; NO_SVE-NEXT: .LBB37_156: // %cond.load217 +; NO_SVE-NEXT: add x10, x0, #73 +; NO_SVE-NEXT: ld1 { v4.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB37_142 +; NO_SVE-NEXT: .LBB37_157: // %cond.load220 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB37_143 +; NO_SVE-NEXT: .LBB37_158: // %cond.load223 +; NO_SVE-NEXT: add x10, x0, #75 +; NO_SVE-NEXT: ld1 { v4.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB37_144 +; NO_SVE-NEXT: .LBB37_159: // %cond.load226 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB37_145 +; NO_SVE-NEXT: .LBB37_160: // %cond.load229 +; NO_SVE-NEXT: add x10, x0, #77 +; NO_SVE-NEXT: ld1 { v4.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB37_146 +; NO_SVE-NEXT: .LBB37_161: // %cond.load232 +; NO_SVE-NEXT: add x10, x0, #78 +; NO_SVE-NEXT: ld1 { v4.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, 
.LBB37_147 +; NO_SVE-NEXT: .LBB37_162: // %cond.load235 +; NO_SVE-NEXT: add x10, x0, #79 +; NO_SVE-NEXT: ld1 { v4.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB37_148 +; NO_SVE-NEXT: .LBB37_163: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #17, .LBB37_165 +; NO_SVE-NEXT: .LBB37_164: // %cond.load241 +; NO_SVE-NEXT: add x10, x0, #81 +; NO_SVE-NEXT: ld1 { v5.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_165: // %else242 +; NO_SVE-NEXT: tbnz w9, #18, .LBB37_181 +; NO_SVE-NEXT: // %bb.166: // %else245 +; NO_SVE-NEXT: tbnz w9, #19, .LBB37_182 +; NO_SVE-NEXT: .LBB37_167: // %else248 +; NO_SVE-NEXT: tbnz w9, #20, .LBB37_183 +; NO_SVE-NEXT: .LBB37_168: // %else251 +; NO_SVE-NEXT: tbnz w9, #21, .LBB37_184 +; NO_SVE-NEXT: .LBB37_169: // %else254 +; NO_SVE-NEXT: tbnz w9, #22, .LBB37_185 +; NO_SVE-NEXT: .LBB37_170: // %else257 +; NO_SVE-NEXT: tbnz w9, #23, .LBB37_186 +; NO_SVE-NEXT: .LBB37_171: // %else260 +; NO_SVE-NEXT: tbnz w9, #24, .LBB37_187 +; NO_SVE-NEXT: .LBB37_172: // %else263 +; NO_SVE-NEXT: tbnz w9, #25, .LBB37_188 +; NO_SVE-NEXT: .LBB37_173: // %else266 +; NO_SVE-NEXT: tbnz w9, #26, .LBB37_189 +; NO_SVE-NEXT: .LBB37_174: // %else269 +; NO_SVE-NEXT: tbnz w9, #27, .LBB37_190 +; NO_SVE-NEXT: .LBB37_175: // %else272 +; NO_SVE-NEXT: tbnz w9, #28, .LBB37_191 +; NO_SVE-NEXT: .LBB37_176: // %else275 +; NO_SVE-NEXT: tbnz w9, #29, .LBB37_192 +; NO_SVE-NEXT: .LBB37_177: // %else278 +; NO_SVE-NEXT: tbnz w9, #30, .LBB37_193 +; NO_SVE-NEXT: .LBB37_178: // %else281 +; NO_SVE-NEXT: tbnz w9, #31, .LBB37_194 +; NO_SVE-NEXT: .LBB37_179: // %else284 +; NO_SVE-NEXT: tbz x9, #32, .LBB37_195 +; NO_SVE-NEXT: .LBB37_180: // %cond.load286 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB37_196 +; NO_SVE-NEXT: b .LBB37_197 +; NO_SVE-NEXT: .LBB37_181: // %cond.load244 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB37_167 +; NO_SVE-NEXT: .LBB37_182: // %cond.load247 +; NO_SVE-NEXT: add x10, x0, #83 +; NO_SVE-NEXT: ld1 { v5.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB37_168 +; NO_SVE-NEXT: .LBB37_183: // %cond.load250 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB37_169 +; NO_SVE-NEXT: .LBB37_184: // %cond.load253 +; NO_SVE-NEXT: add x10, x0, #85 +; NO_SVE-NEXT: ld1 { v5.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB37_170 +; NO_SVE-NEXT: .LBB37_185: // %cond.load256 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB37_171 +; NO_SVE-NEXT: .LBB37_186: // %cond.load259 +; NO_SVE-NEXT: add x10, x0, #87 +; NO_SVE-NEXT: ld1 { v5.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB37_172 +; NO_SVE-NEXT: .LBB37_187: // %cond.load262 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB37_173 +; NO_SVE-NEXT: .LBB37_188: // %cond.load265 +; NO_SVE-NEXT: add x10, x0, #89 +; NO_SVE-NEXT: ld1 { v5.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB37_174 +; NO_SVE-NEXT: .LBB37_189: // %cond.load268 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB37_175 +; NO_SVE-NEXT: .LBB37_190: // %cond.load271 +; NO_SVE-NEXT: add x10, x0, #91 +; NO_SVE-NEXT: ld1 { v5.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB37_176 +; NO_SVE-NEXT: .LBB37_191: // %cond.load274 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB37_177 +; NO_SVE-NEXT: .LBB37_192: // 
%cond.load277 +; NO_SVE-NEXT: add x10, x0, #93 +; NO_SVE-NEXT: ld1 { v5.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB37_178 +; NO_SVE-NEXT: .LBB37_193: // %cond.load280 +; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB37_179 +; NO_SVE-NEXT: .LBB37_194: // %cond.load283 +; NO_SVE-NEXT: add x10, x0, #95 +; NO_SVE-NEXT: ld1 { v5.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB37_180 +; NO_SVE-NEXT: .LBB37_195: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #33, .LBB37_197 +; NO_SVE-NEXT: .LBB37_196: // %cond.load289 +; NO_SVE-NEXT: add x10, x0, #97 +; NO_SVE-NEXT: ld1 { v6.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_197: // %else290 +; NO_SVE-NEXT: tbnz x9, #34, .LBB37_213 +; NO_SVE-NEXT: // %bb.198: // %else293 +; NO_SVE-NEXT: tbnz x9, #35, .LBB37_214 +; NO_SVE-NEXT: .LBB37_199: // %else296 +; NO_SVE-NEXT: tbnz x9, #36, .LBB37_215 +; NO_SVE-NEXT: .LBB37_200: // %else299 +; NO_SVE-NEXT: tbnz x9, #37, .LBB37_216 +; NO_SVE-NEXT: .LBB37_201: // %else302 +; NO_SVE-NEXT: tbnz x9, #38, .LBB37_217 +; NO_SVE-NEXT: .LBB37_202: // %else305 +; NO_SVE-NEXT: tbnz x9, #39, .LBB37_218 +; NO_SVE-NEXT: .LBB37_203: // %else308 +; NO_SVE-NEXT: tbnz x9, #40, .LBB37_219 +; NO_SVE-NEXT: .LBB37_204: // %else311 +; NO_SVE-NEXT: tbnz x9, #41, .LBB37_220 +; NO_SVE-NEXT: .LBB37_205: // %else314 +; NO_SVE-NEXT: tbnz x9, #42, .LBB37_221 +; NO_SVE-NEXT: .LBB37_206: // %else317 +; NO_SVE-NEXT: tbnz x9, #43, .LBB37_222 +; NO_SVE-NEXT: .LBB37_207: // %else320 +; NO_SVE-NEXT: tbnz x9, #44, .LBB37_223 +; NO_SVE-NEXT: .LBB37_208: // %else323 +; NO_SVE-NEXT: tbnz x9, #45, .LBB37_224 +; NO_SVE-NEXT: .LBB37_209: // %else326 +; NO_SVE-NEXT: tbnz x9, #46, .LBB37_225 +; NO_SVE-NEXT: .LBB37_210: // %else329 +; NO_SVE-NEXT: tbnz x9, #47, .LBB37_226 +; NO_SVE-NEXT: .LBB37_211: // %else332 +; NO_SVE-NEXT: tbz x9, #48, .LBB37_227 +; NO_SVE-NEXT: .LBB37_212: // %cond.load334 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB37_228 +; NO_SVE-NEXT: b .LBB37_229 +; NO_SVE-NEXT: .LBB37_213: // %cond.load292 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB37_199 +; NO_SVE-NEXT: .LBB37_214: // %cond.load295 +; NO_SVE-NEXT: add x10, x0, #99 +; NO_SVE-NEXT: ld1 { v6.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB37_200 +; NO_SVE-NEXT: .LBB37_215: // %cond.load298 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB37_201 +; NO_SVE-NEXT: .LBB37_216: // %cond.load301 +; NO_SVE-NEXT: add x10, x0, #101 +; NO_SVE-NEXT: ld1 { v6.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB37_202 +; NO_SVE-NEXT: .LBB37_217: // %cond.load304 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB37_203 +; NO_SVE-NEXT: .LBB37_218: // %cond.load307 +; NO_SVE-NEXT: add x10, x0, #103 +; NO_SVE-NEXT: ld1 { v6.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB37_204 +; NO_SVE-NEXT: .LBB37_219: // %cond.load310 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB37_205 +; NO_SVE-NEXT: .LBB37_220: // %cond.load313 +; NO_SVE-NEXT: add x10, x0, #105 +; NO_SVE-NEXT: ld1 { v6.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB37_206 +; NO_SVE-NEXT: .LBB37_221: // %cond.load316 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB37_207 +; NO_SVE-NEXT: .LBB37_222: // %cond.load319 +; NO_SVE-NEXT: add 
x10, x0, #107 +; NO_SVE-NEXT: ld1 { v6.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB37_208 +; NO_SVE-NEXT: .LBB37_223: // %cond.load322 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB37_209 +; NO_SVE-NEXT: .LBB37_224: // %cond.load325 +; NO_SVE-NEXT: add x10, x0, #109 +; NO_SVE-NEXT: ld1 { v6.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB37_210 +; NO_SVE-NEXT: .LBB37_225: // %cond.load328 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB37_211 +; NO_SVE-NEXT: .LBB37_226: // %cond.load331 +; NO_SVE-NEXT: add x10, x0, #111 +; NO_SVE-NEXT: ld1 { v6.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB37_212 +; NO_SVE-NEXT: .LBB37_227: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #49, .LBB37_229 +; NO_SVE-NEXT: .LBB37_228: // %cond.load337 +; NO_SVE-NEXT: add x10, x0, #113 +; NO_SVE-NEXT: ld1 { v7.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_229: // %else338 +; NO_SVE-NEXT: tbnz x9, #50, .LBB37_245 +; NO_SVE-NEXT: // %bb.230: // %else341 +; NO_SVE-NEXT: tbnz x9, #51, .LBB37_246 +; NO_SVE-NEXT: .LBB37_231: // %else344 +; NO_SVE-NEXT: tbnz x9, #52, .LBB37_247 +; NO_SVE-NEXT: .LBB37_232: // %else347 +; NO_SVE-NEXT: tbnz x9, #53, .LBB37_248 +; NO_SVE-NEXT: .LBB37_233: // %else350 +; NO_SVE-NEXT: tbnz x9, #54, .LBB37_249 +; NO_SVE-NEXT: .LBB37_234: // %else353 +; NO_SVE-NEXT: tbnz x9, #55, .LBB37_250 +; NO_SVE-NEXT: .LBB37_235: // %else356 +; NO_SVE-NEXT: tbnz x9, #56, .LBB37_251 +; NO_SVE-NEXT: .LBB37_236: // %else359 +; NO_SVE-NEXT: tbnz x9, #57, .LBB37_252 +; NO_SVE-NEXT: .LBB37_237: // %else362 +; NO_SVE-NEXT: tbnz x9, #58, .LBB37_253 +; NO_SVE-NEXT: .LBB37_238: // %else365 +; NO_SVE-NEXT: tbnz x9, #59, .LBB37_254 +; NO_SVE-NEXT: .LBB37_239: // %else368 +; NO_SVE-NEXT: tbnz x9, #60, .LBB37_255 +; NO_SVE-NEXT: .LBB37_240: // %else371 +; NO_SVE-NEXT: tbnz x9, #61, .LBB37_256 +; NO_SVE-NEXT: .LBB37_241: // %else374 +; NO_SVE-NEXT: tbnz x9, #62, .LBB37_257 +; NO_SVE-NEXT: .LBB37_242: // %else377 +; NO_SVE-NEXT: tbz x9, #63, .LBB37_244 +; NO_SVE-NEXT: .LBB37_243: // %cond.load379 +; NO_SVE-NEXT: add x9, x0, #127 +; NO_SVE-NEXT: ld1 { v7.b }[15], [x9] +; NO_SVE-NEXT: .LBB37_244: // %else380 +; NO_SVE-NEXT: sshll2 v16.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v17.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: sshll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q1, q17, [x8, #32] +; NO_SVE-NEXT: sshll2 v1.8h, v3.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: sshll v0.8h, v3.8b, #0 +; NO_SVE-NEXT: sshll2 v2.8h, v4.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #96] +; NO_SVE-NEXT: sshll v1.8h, v4.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v5.16b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #128] +; NO_SVE-NEXT: sshll v2.8h, v5.8b, #0 +; NO_SVE-NEXT: sshll2 v1.8h, v6.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.8h, v6.8b, #0 +; NO_SVE-NEXT: sshll2 v2.8h, v7.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #192] +; NO_SVE-NEXT: sshll v1.8h, v7.8b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #224] +; NO_SVE-NEXT: ldr x19, [sp, #32] // 8-byte Folded Reload +; NO_SVE-NEXT: add sp, sp, #48 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB37_245: // %cond.load340 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB37_231 +; NO_SVE-NEXT: .LBB37_246: // %cond.load343 +; NO_SVE-NEXT: add x10, x0, #115 +; NO_SVE-NEXT: 
ld1 { v7.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB37_232 +; NO_SVE-NEXT: .LBB37_247: // %cond.load346 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB37_233 +; NO_SVE-NEXT: .LBB37_248: // %cond.load349 +; NO_SVE-NEXT: add x10, x0, #117 +; NO_SVE-NEXT: ld1 { v7.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB37_234 +; NO_SVE-NEXT: .LBB37_249: // %cond.load352 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB37_235 +; NO_SVE-NEXT: .LBB37_250: // %cond.load355 +; NO_SVE-NEXT: add x10, x0, #119 +; NO_SVE-NEXT: ld1 { v7.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB37_236 +; NO_SVE-NEXT: .LBB37_251: // %cond.load358 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB37_237 +; NO_SVE-NEXT: .LBB37_252: // %cond.load361 +; NO_SVE-NEXT: add x10, x0, #121 +; NO_SVE-NEXT: ld1 { v7.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB37_238 +; NO_SVE-NEXT: .LBB37_253: // %cond.load364 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB37_239 +; NO_SVE-NEXT: .LBB37_254: // %cond.load367 +; NO_SVE-NEXT: add x10, x0, #123 +; NO_SVE-NEXT: ld1 { v7.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB37_240 +; NO_SVE-NEXT: .LBB37_255: // %cond.load370 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB37_241 +; NO_SVE-NEXT: .LBB37_256: // %cond.load373 +; NO_SVE-NEXT: add x10, x0, #125 +; NO_SVE-NEXT: ld1 { v7.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB37_242 +; NO_SVE-NEXT: .LBB37_257: // %cond.load376 +; NO_SVE-NEXT: add x10, x0, #126 +; NO_SVE-NEXT: ld1 { v7.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB37_243 +; NO_SVE-NEXT: b .LBB37_244 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -663,6 +8141,625 @@ } define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v64i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] 
+; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, 
v0.b[1] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v0.b[0] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: bfi w12, w13, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w12, w15, #2, #1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: bfi w12, w15, #5, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w12, w12, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v0.b[10] +; NO_SVE-NEXT: orr w11, w11, w17, lsl #13 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #8 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #14 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[13] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB38_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB38_3 +; NO_SVE-NEXT: b .LBB38_4 +; NO_SVE-NEXT: .LBB38_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB38_4 +; NO_SVE-NEXT: .LBB38_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB38_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB38_21 +; NO_SVE-NEXT: .LBB38_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB38_22 +; NO_SVE-NEXT: .LBB38_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB38_23 +; NO_SVE-NEXT: .LBB38_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB38_24 +; NO_SVE-NEXT: .LBB38_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB38_25 +; NO_SVE-NEXT: .LBB38_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB38_26 +; NO_SVE-NEXT: .LBB38_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB38_27 +; NO_SVE-NEXT: .LBB38_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB38_28 +; NO_SVE-NEXT: .LBB38_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB38_29 +; NO_SVE-NEXT: .LBB38_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB38_30 +; NO_SVE-NEXT: .LBB38_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB38_31 +; NO_SVE-NEXT: .LBB38_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB38_32 +; 
NO_SVE-NEXT: .LBB38_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB38_33 +; NO_SVE-NEXT: .LBB38_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB38_34 +; NO_SVE-NEXT: .LBB38_19: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB38_35 +; NO_SVE-NEXT: b .LBB38_36 +; NO_SVE-NEXT: .LBB38_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB38_6 +; NO_SVE-NEXT: .LBB38_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB38_7 +; NO_SVE-NEXT: .LBB38_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB38_8 +; NO_SVE-NEXT: .LBB38_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB38_9 +; NO_SVE-NEXT: .LBB38_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB38_10 +; NO_SVE-NEXT: .LBB38_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB38_11 +; NO_SVE-NEXT: .LBB38_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB38_12 +; NO_SVE-NEXT: .LBB38_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB38_13 +; NO_SVE-NEXT: .LBB38_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB38_14 +; NO_SVE-NEXT: .LBB38_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB38_15 +; NO_SVE-NEXT: .LBB38_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB38_16 +; NO_SVE-NEXT: .LBB38_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB38_17 +; NO_SVE-NEXT: .LBB38_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB38_18 +; NO_SVE-NEXT: .LBB38_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB38_19 +; NO_SVE-NEXT: .LBB38_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB38_36 +; NO_SVE-NEXT: .LBB38_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB38_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB38_53 +; NO_SVE-NEXT: .LBB38_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB38_54 +; NO_SVE-NEXT: .LBB38_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB38_55 +; NO_SVE-NEXT: .LBB38_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB38_56 +; NO_SVE-NEXT: .LBB38_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB38_57 +; NO_SVE-NEXT: .LBB38_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB38_58 +; NO_SVE-NEXT: .LBB38_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB38_59 +; NO_SVE-NEXT: .LBB38_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB38_60 +; NO_SVE-NEXT: .LBB38_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB38_61 +; NO_SVE-NEXT: .LBB38_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB38_62 
+; NO_SVE-NEXT: .LBB38_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB38_63 +; NO_SVE-NEXT: .LBB38_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB38_64 +; NO_SVE-NEXT: .LBB38_49: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB38_65 +; NO_SVE-NEXT: .LBB38_50: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB38_66 +; NO_SVE-NEXT: .LBB38_51: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB38_67 +; NO_SVE-NEXT: b .LBB38_68 +; NO_SVE-NEXT: .LBB38_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB38_38 +; NO_SVE-NEXT: .LBB38_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB38_39 +; NO_SVE-NEXT: .LBB38_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB38_40 +; NO_SVE-NEXT: .LBB38_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB38_41 +; NO_SVE-NEXT: .LBB38_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB38_42 +; NO_SVE-NEXT: .LBB38_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB38_43 +; NO_SVE-NEXT: .LBB38_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB38_44 +; NO_SVE-NEXT: .LBB38_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB38_45 +; NO_SVE-NEXT: .LBB38_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB38_46 +; NO_SVE-NEXT: .LBB38_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB38_47 +; NO_SVE-NEXT: .LBB38_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB38_48 +; NO_SVE-NEXT: .LBB38_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB38_49 +; NO_SVE-NEXT: .LBB38_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB38_50 +; NO_SVE-NEXT: .LBB38_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB38_51 +; NO_SVE-NEXT: .LBB38_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x9, #33, .LBB38_68 +; NO_SVE-NEXT: .LBB38_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB38_84 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB38_85 +; NO_SVE-NEXT: .LBB38_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB38_86 +; NO_SVE-NEXT: .LBB38_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB38_87 +; NO_SVE-NEXT: .LBB38_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB38_88 +; NO_SVE-NEXT: .LBB38_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB38_89 +; NO_SVE-NEXT: .LBB38_74: // %else116 +; NO_SVE-NEXT: tbnz x9, #40, .LBB38_90 +; NO_SVE-NEXT: .LBB38_75: // %else119 +; NO_SVE-NEXT: tbnz x9, #41, .LBB38_91 +; NO_SVE-NEXT: .LBB38_76: // %else122 +; 
NO_SVE-NEXT: tbnz x9, #42, .LBB38_92 +; NO_SVE-NEXT: .LBB38_77: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB38_93 +; NO_SVE-NEXT: .LBB38_78: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB38_94 +; NO_SVE-NEXT: .LBB38_79: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB38_95 +; NO_SVE-NEXT: .LBB38_80: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB38_96 +; NO_SVE-NEXT: .LBB38_81: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB38_97 +; NO_SVE-NEXT: .LBB38_82: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB38_98 +; NO_SVE-NEXT: .LBB38_83: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB38_99 +; NO_SVE-NEXT: b .LBB38_100 +; NO_SVE-NEXT: .LBB38_84: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB38_70 +; NO_SVE-NEXT: .LBB38_85: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB38_71 +; NO_SVE-NEXT: .LBB38_86: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB38_72 +; NO_SVE-NEXT: .LBB38_87: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB38_73 +; NO_SVE-NEXT: .LBB38_88: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB38_74 +; NO_SVE-NEXT: .LBB38_89: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB38_75 +; NO_SVE-NEXT: .LBB38_90: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB38_76 +; NO_SVE-NEXT: .LBB38_91: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB38_77 +; NO_SVE-NEXT: .LBB38_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB38_78 +; NO_SVE-NEXT: .LBB38_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB38_79 +; NO_SVE-NEXT: .LBB38_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB38_80 +; NO_SVE-NEXT: .LBB38_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB38_81 +; NO_SVE-NEXT: .LBB38_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB38_82 +; NO_SVE-NEXT: .LBB38_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB38_83 +; NO_SVE-NEXT: .LBB38_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz x9, #49, .LBB38_100 +; NO_SVE-NEXT: .LBB38_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB38_116 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB38_117 +; NO_SVE-NEXT: .LBB38_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB38_118 +; NO_SVE-NEXT: .LBB38_103: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB38_119 +; NO_SVE-NEXT: .LBB38_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB38_120 +; NO_SVE-NEXT: .LBB38_105: // %else161 +; NO_SVE-NEXT: 
tbnz x9, #55, .LBB38_121 +; NO_SVE-NEXT: .LBB38_106: // %else164 +; NO_SVE-NEXT: tbnz x9, #56, .LBB38_122 +; NO_SVE-NEXT: .LBB38_107: // %else167 +; NO_SVE-NEXT: tbnz x9, #57, .LBB38_123 +; NO_SVE-NEXT: .LBB38_108: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB38_124 +; NO_SVE-NEXT: .LBB38_109: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB38_125 +; NO_SVE-NEXT: .LBB38_110: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB38_126 +; NO_SVE-NEXT: .LBB38_111: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB38_127 +; NO_SVE-NEXT: .LBB38_112: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB38_128 +; NO_SVE-NEXT: .LBB38_113: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB38_115 +; NO_SVE-NEXT: .LBB38_114: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x9] +; NO_SVE-NEXT: .LBB38_115: // %else188 +; NO_SVE-NEXT: sshll v6.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll2 v5.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v7.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: sshll v6.4s, v6.4h, #0 +; NO_SVE-NEXT: sshll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: sshll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: sshll v5.4s, v5.4h, #0 +; NO_SVE-NEXT: sshll2 v4.8h, v3.16b, #0 +; NO_SVE-NEXT: sshll2 v6.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v4.8h, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: sshll v0.4s, v4.4h, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: sshll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: sshll v2.8h, v3.8b, #0 +; NO_SVE-NEXT: sshll2 v17.4s, v7.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll v7.4s, v7.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB38_116: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB38_102 +; NO_SVE-NEXT: .LBB38_117: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB38_103 +; NO_SVE-NEXT: .LBB38_118: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB38_104 +; NO_SVE-NEXT: .LBB38_119: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB38_105 +; NO_SVE-NEXT: .LBB38_120: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB38_106 +; NO_SVE-NEXT: .LBB38_121: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB38_107 +; NO_SVE-NEXT: .LBB38_122: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB38_108 +; NO_SVE-NEXT: .LBB38_123: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB38_109 +; NO_SVE-NEXT: .LBB38_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB38_110 
+; NO_SVE-NEXT: .LBB38_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB38_111 +; NO_SVE-NEXT: .LBB38_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB38_112 +; NO_SVE-NEXT: .LBB38_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB38_113 +; NO_SVE-NEXT: .LBB38_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB38_114 +; NO_SVE-NEXT: b .LBB38_115 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -679,6 +8776,338 @@ } define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: umov w9, v0.b[8] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w11, v0.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[12] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w16, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: bfi w16, w10, #4, #1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: umov w10, v0.b[14] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w12, w16, w12, lsl #6 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w11, w12, w11, lsl #7 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov 
w13, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[11] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: orr w10, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[12] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[14] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v0.b[15] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB39_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB39_3 +; NO_SVE-NEXT: b .LBB39_4 +; NO_SVE-NEXT: .LBB39_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB39_4 +; NO_SVE-NEXT: .LBB39_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB39_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB39_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB39_21 +; NO_SVE-NEXT: .LBB39_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB39_22 +; NO_SVE-NEXT: .LBB39_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB39_23 +; NO_SVE-NEXT: .LBB39_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB39_24 +; NO_SVE-NEXT: .LBB39_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB39_25 +; NO_SVE-NEXT: .LBB39_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB39_26 +; NO_SVE-NEXT: .LBB39_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB39_27 +; NO_SVE-NEXT: .LBB39_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB39_28 +; NO_SVE-NEXT: .LBB39_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB39_29 +; NO_SVE-NEXT: .LBB39_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB39_30 +; NO_SVE-NEXT: .LBB39_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB39_31 +; NO_SVE-NEXT: .LBB39_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB39_32 +; NO_SVE-NEXT: .LBB39_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB39_33 +; NO_SVE-NEXT: .LBB39_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB39_34 +; NO_SVE-NEXT: .LBB39_19: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB39_35 +; NO_SVE-NEXT: b .LBB39_36 +; NO_SVE-NEXT: .LBB39_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB39_6 +; NO_SVE-NEXT: .LBB39_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB39_7 +; NO_SVE-NEXT: .LBB39_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB39_8 +; NO_SVE-NEXT: .LBB39_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB39_9 +; NO_SVE-NEXT: .LBB39_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB39_10 +; NO_SVE-NEXT: .LBB39_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] 
+; NO_SVE-NEXT: tbz w9, #8, .LBB39_11 +; NO_SVE-NEXT: .LBB39_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB39_12 +; NO_SVE-NEXT: .LBB39_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB39_13 +; NO_SVE-NEXT: .LBB39_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB39_14 +; NO_SVE-NEXT: .LBB39_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB39_15 +; NO_SVE-NEXT: .LBB39_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB39_16 +; NO_SVE-NEXT: .LBB39_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB39_17 +; NO_SVE-NEXT: .LBB39_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB39_18 +; NO_SVE-NEXT: .LBB39_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB39_19 +; NO_SVE-NEXT: .LBB39_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB39_36 +; NO_SVE-NEXT: .LBB39_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB39_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB39_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB39_53 +; NO_SVE-NEXT: .LBB39_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB39_54 +; NO_SVE-NEXT: .LBB39_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB39_55 +; NO_SVE-NEXT: .LBB39_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB39_56 +; NO_SVE-NEXT: .LBB39_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB39_57 +; NO_SVE-NEXT: .LBB39_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB39_58 +; NO_SVE-NEXT: .LBB39_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB39_59 +; NO_SVE-NEXT: .LBB39_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB39_60 +; NO_SVE-NEXT: .LBB39_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB39_61 +; NO_SVE-NEXT: .LBB39_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB39_62 +; NO_SVE-NEXT: .LBB39_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB39_63 +; NO_SVE-NEXT: .LBB39_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB39_64 +; NO_SVE-NEXT: .LBB39_49: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB39_51 +; NO_SVE-NEXT: .LBB39_50: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: .LBB39_51: // %else92 +; NO_SVE-NEXT: sshll v3.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v1.8b, #0 +; NO_SVE-NEXT: sshll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll2 v1.8h, v1.16b, #0 +; NO_SVE-NEXT: sshll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v5.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #96] +; NO_SVE-NEXT: sshll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8, #64] +; NO_SVE-NEXT: sshll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll2 v4.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #32] +; NO_SVE-NEXT: sshll v0.2d, 
v1.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #192] +; NO_SVE-NEXT: sshll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #160] +; NO_SVE-NEXT: sshll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: sshll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: sshll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #224] +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB39_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB39_38 +; NO_SVE-NEXT: .LBB39_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB39_39 +; NO_SVE-NEXT: .LBB39_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB39_40 +; NO_SVE-NEXT: .LBB39_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB39_41 +; NO_SVE-NEXT: .LBB39_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB39_42 +; NO_SVE-NEXT: .LBB39_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB39_43 +; NO_SVE-NEXT: .LBB39_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB39_44 +; NO_SVE-NEXT: .LBB39_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB39_45 +; NO_SVE-NEXT: .LBB39_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB39_46 +; NO_SVE-NEXT: .LBB39_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB39_47 +; NO_SVE-NEXT: .LBB39_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB39_48 +; NO_SVE-NEXT: .LBB39_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB39_49 +; NO_SVE-NEXT: .LBB39_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB39_50 +; NO_SVE-NEXT: b .LBB39_51 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -695,6 +9124,635 @@ } define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v64i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q2, [x1, #96] +; NO_SVE-NEXT: cmeq v3.8h, v3.8h, #0 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w9, v3.b[1] +; NO_SVE-NEXT: umov w11, v3.b[2] +; NO_SVE-NEXT: umov w10, v3.b[0] +; NO_SVE-NEXT: umov w12, v3.b[3] +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: xtn v6.8b, v2.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and 
w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v3.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: umov w17, v6.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v6.b[2] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: umov w11, v6.b[3] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: umov w12, v6.b[4] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: cmeq v2.8h, v5.8h, #0 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: umov w13, v6.b[5] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v2.8h +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: umov w14, v6.b[6] +; NO_SVE-NEXT: umov w15, v5.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w16, v6.b[7] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v5.b[0] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: umov w13, v5.b[2] +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[4] +; NO_SVE-NEXT: bfi w10, w11, #1, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: cmeq v2.8h, v4.8h, #0 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[6] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v5.b[7] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[1] +; NO_SVE-NEXT: ldp q7, q3, [x1, #32] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #6 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v2.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: cmeq v4.8h, v7.8h, #0 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #9 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[1] +; NO_SVE-NEXT: umov w15, v4.b[0] +; NO_SVE-NEXT: umov w17, v4.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #13 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[3] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[4] +; NO_SVE-NEXT: bfi w12, w11, #1, #1 +; NO_SVE-NEXT: umov w11, v4.b[5] +; NO_SVE-NEXT: cmeq 
v3.8h, v3.8h, #0 +; NO_SVE-NEXT: umov w17, v4.b[7] +; NO_SVE-NEXT: bfi w12, w13, #2, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[6] +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: umov w13, v3.b[0] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: bfi w12, w11, #5, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w11, w12, w11, lsl #6 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #7 +; NO_SVE-NEXT: umov w14, v3.b[3] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: umov w16, v1.b[1] +; NO_SVE-NEXT: umov w17, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: umov w18, v1.b[4] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #11 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[3] +; NO_SVE-NEXT: umov w1, v1.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w15, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w15, w18, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: bfi w12, w16, #2, #1 +; NO_SVE-NEXT: and w16, w1, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #13 +; NO_SVE-NEXT: bfi w12, w14, #3, #1 +; NO_SVE-NEXT: umov w14, v1.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w12, w15, #4, #1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: bfi w12, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[1] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #6 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #7 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[3] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #14 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #8 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[6] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v3.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB40_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; 
NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB40_3 +; NO_SVE-NEXT: b .LBB40_4 +; NO_SVE-NEXT: .LBB40_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB40_4 +; NO_SVE-NEXT: .LBB40_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB40_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB40_13 +; NO_SVE-NEXT: .LBB40_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB40_14 +; NO_SVE-NEXT: .LBB40_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB40_15 +; NO_SVE-NEXT: .LBB40_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB40_16 +; NO_SVE-NEXT: .LBB40_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB40_17 +; NO_SVE-NEXT: .LBB40_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB40_18 +; NO_SVE-NEXT: .LBB40_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB40_19 +; NO_SVE-NEXT: b .LBB40_20 +; NO_SVE-NEXT: .LBB40_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB40_6 +; NO_SVE-NEXT: .LBB40_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB40_7 +; NO_SVE-NEXT: .LBB40_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB40_8 +; NO_SVE-NEXT: .LBB40_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB40_9 +; NO_SVE-NEXT: .LBB40_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB40_10 +; NO_SVE-NEXT: .LBB40_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB40_11 +; NO_SVE-NEXT: .LBB40_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB40_20 +; NO_SVE-NEXT: .LBB40_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB40_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB40_29 +; NO_SVE-NEXT: .LBB40_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB40_30 +; NO_SVE-NEXT: .LBB40_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB40_31 +; NO_SVE-NEXT: .LBB40_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB40_32 +; NO_SVE-NEXT: .LBB40_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB40_33 +; NO_SVE-NEXT: .LBB40_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB40_34 +; NO_SVE-NEXT: .LBB40_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB40_35 +; NO_SVE-NEXT: b .LBB40_36 +; NO_SVE-NEXT: .LBB40_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB40_22 +; NO_SVE-NEXT: .LBB40_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB40_23 +; NO_SVE-NEXT: .LBB40_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB40_24 +; NO_SVE-NEXT: .LBB40_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB40_25 +; NO_SVE-NEXT: .LBB40_32: // %cond.load40 +; NO_SVE-NEXT: add x10, 
x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB40_26 +; NO_SVE-NEXT: .LBB40_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB40_27 +; NO_SVE-NEXT: .LBB40_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB40_36 +; NO_SVE-NEXT: .LBB40_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB40_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB40_45 +; NO_SVE-NEXT: .LBB40_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB40_46 +; NO_SVE-NEXT: .LBB40_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB40_47 +; NO_SVE-NEXT: .LBB40_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB40_48 +; NO_SVE-NEXT: .LBB40_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB40_49 +; NO_SVE-NEXT: .LBB40_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB40_50 +; NO_SVE-NEXT: .LBB40_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB40_51 +; NO_SVE-NEXT: b .LBB40_52 +; NO_SVE-NEXT: .LBB40_44: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB40_38 +; NO_SVE-NEXT: .LBB40_45: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB40_39 +; NO_SVE-NEXT: .LBB40_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB40_40 +; NO_SVE-NEXT: .LBB40_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB40_41 +; NO_SVE-NEXT: .LBB40_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB40_42 +; NO_SVE-NEXT: .LBB40_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB40_43 +; NO_SVE-NEXT: .LBB40_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB40_52 +; NO_SVE-NEXT: .LBB40_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB40_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB40_61 +; NO_SVE-NEXT: .LBB40_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB40_62 +; NO_SVE-NEXT: .LBB40_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB40_63 +; NO_SVE-NEXT: .LBB40_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB40_64 +; NO_SVE-NEXT: .LBB40_57: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB40_65 +; NO_SVE-NEXT: .LBB40_58: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB40_66 +; NO_SVE-NEXT: .LBB40_59: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB40_67 +; NO_SVE-NEXT: b .LBB40_68 +; NO_SVE-NEXT: .LBB40_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB40_54 +; NO_SVE-NEXT: .LBB40_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB40_55 +; NO_SVE-NEXT: .LBB40_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB40_56 +; NO_SVE-NEXT: .LBB40_63: // 
%cond.load85 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB40_57 +; NO_SVE-NEXT: .LBB40_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB40_58 +; NO_SVE-NEXT: .LBB40_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB40_59 +; NO_SVE-NEXT: .LBB40_66: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz x9, #33, .LBB40_68 +; NO_SVE-NEXT: .LBB40_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB40_76 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB40_77 +; NO_SVE-NEXT: .LBB40_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB40_78 +; NO_SVE-NEXT: .LBB40_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB40_79 +; NO_SVE-NEXT: .LBB40_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB40_80 +; NO_SVE-NEXT: .LBB40_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB40_81 +; NO_SVE-NEXT: .LBB40_74: // %else116 +; NO_SVE-NEXT: tbz x9, #40, .LBB40_82 +; NO_SVE-NEXT: .LBB40_75: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #41, .LBB40_83 +; NO_SVE-NEXT: b .LBB40_84 +; NO_SVE-NEXT: .LBB40_76: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB40_70 +; NO_SVE-NEXT: .LBB40_77: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB40_71 +; NO_SVE-NEXT: .LBB40_78: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB40_72 +; NO_SVE-NEXT: .LBB40_79: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB40_73 +; NO_SVE-NEXT: .LBB40_80: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB40_74 +; NO_SVE-NEXT: .LBB40_81: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #78 +; NO_SVE-NEXT: ld1 { v4.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #40, .LBB40_75 +; NO_SVE-NEXT: .LBB40_82: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz x9, #41, .LBB40_84 +; NO_SVE-NEXT: .LBB40_83: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_84: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB40_92 +; NO_SVE-NEXT: // %bb.85: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB40_93 +; NO_SVE-NEXT: .LBB40_86: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB40_94 +; NO_SVE-NEXT: .LBB40_87: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB40_95 +; NO_SVE-NEXT: .LBB40_88: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB40_96 +; NO_SVE-NEXT: .LBB40_89: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB40_97 +; NO_SVE-NEXT: .LBB40_90: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB40_98 +; NO_SVE-NEXT: .LBB40_91: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB40_99 +; NO_SVE-NEXT: b .LBB40_100 +; NO_SVE-NEXT: .LBB40_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB40_86 +; NO_SVE-NEXT: .LBB40_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.h }[3], [x10] +; 
NO_SVE-NEXT: tbz x9, #44, .LBB40_87 +; NO_SVE-NEXT: .LBB40_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB40_88 +; NO_SVE-NEXT: .LBB40_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB40_89 +; NO_SVE-NEXT: .LBB40_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB40_90 +; NO_SVE-NEXT: .LBB40_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB40_91 +; NO_SVE-NEXT: .LBB40_98: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #49, .LBB40_100 +; NO_SVE-NEXT: .LBB40_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB40_108 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB40_109 +; NO_SVE-NEXT: .LBB40_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB40_110 +; NO_SVE-NEXT: .LBB40_103: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB40_111 +; NO_SVE-NEXT: .LBB40_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB40_112 +; NO_SVE-NEXT: .LBB40_105: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB40_113 +; NO_SVE-NEXT: .LBB40_106: // %else164 +; NO_SVE-NEXT: tbz x9, #56, .LBB40_114 +; NO_SVE-NEXT: .LBB40_107: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #57, .LBB40_115 +; NO_SVE-NEXT: b .LBB40_116 +; NO_SVE-NEXT: .LBB40_108: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB40_102 +; NO_SVE-NEXT: .LBB40_109: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB40_103 +; NO_SVE-NEXT: .LBB40_110: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB40_104 +; NO_SVE-NEXT: .LBB40_111: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB40_105 +; NO_SVE-NEXT: .LBB40_112: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB40_106 +; NO_SVE-NEXT: .LBB40_113: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #56, .LBB40_107 +; NO_SVE-NEXT: .LBB40_114: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #57, .LBB40_116 +; NO_SVE-NEXT: .LBB40_115: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_116: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB40_124 +; NO_SVE-NEXT: // %bb.117: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB40_125 +; NO_SVE-NEXT: .LBB40_118: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB40_126 +; NO_SVE-NEXT: .LBB40_119: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB40_127 +; NO_SVE-NEXT: .LBB40_120: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB40_128 +; NO_SVE-NEXT: .LBB40_121: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB40_123 +; NO_SVE-NEXT: .LBB40_122: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #126 +; NO_SVE-NEXT: ld1 { v7.h }[7], [x9] +; NO_SVE-NEXT: .LBB40_123: // %else188 +; NO_SVE-NEXT: sshll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q0, q16, 
[x8] +; NO_SVE-NEXT: sshll2 v16.4s, v1.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: sshll2 v0.4s, v3.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: sshll2 v0.4s, v4.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v4.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v5.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: sshll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v6.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: sshll2 v0.4s, v7.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v7.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB40_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB40_118 +; NO_SVE-NEXT: .LBB40_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB40_119 +; NO_SVE-NEXT: .LBB40_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB40_120 +; NO_SVE-NEXT: .LBB40_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB40_121 +; NO_SVE-NEXT: .LBB40_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.h }[6], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB40_122 +; NO_SVE-NEXT: b .LBB40_123 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -711,6 +9769,343 @@ } define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w17, v1.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: umov w17, v2.b[0] +; 
NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #9 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v2.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #12 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: bfi w16, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w16, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[6] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: orr w12, w16, w12, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w12, w10, lsl #7 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB41_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB41_3 +; NO_SVE-NEXT: b .LBB41_4 +; NO_SVE-NEXT: .LBB41_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB41_4 +; NO_SVE-NEXT: .LBB41_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB41_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB41_13 +; NO_SVE-NEXT: .LBB41_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB41_14 +; NO_SVE-NEXT: .LBB41_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB41_15 +; NO_SVE-NEXT: .LBB41_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB41_16 +; NO_SVE-NEXT: .LBB41_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB41_17 +; NO_SVE-NEXT: .LBB41_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB41_18 +; NO_SVE-NEXT: .LBB41_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB41_19 +; NO_SVE-NEXT: b .LBB41_20 +; NO_SVE-NEXT: .LBB41_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB41_6 +; NO_SVE-NEXT: .LBB41_13: // %cond.load7 +; NO_SVE-NEXT: add x10, 
x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB41_7 +; NO_SVE-NEXT: .LBB41_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB41_8 +; NO_SVE-NEXT: .LBB41_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB41_9 +; NO_SVE-NEXT: .LBB41_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB41_10 +; NO_SVE-NEXT: .LBB41_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB41_11 +; NO_SVE-NEXT: .LBB41_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB41_20 +; NO_SVE-NEXT: .LBB41_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB41_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB41_29 +; NO_SVE-NEXT: .LBB41_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB41_30 +; NO_SVE-NEXT: .LBB41_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB41_31 +; NO_SVE-NEXT: .LBB41_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB41_32 +; NO_SVE-NEXT: .LBB41_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB41_33 +; NO_SVE-NEXT: .LBB41_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB41_34 +; NO_SVE-NEXT: .LBB41_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB41_35 +; NO_SVE-NEXT: b .LBB41_36 +; NO_SVE-NEXT: .LBB41_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB41_22 +; NO_SVE-NEXT: .LBB41_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB41_23 +; NO_SVE-NEXT: .LBB41_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB41_24 +; NO_SVE-NEXT: .LBB41_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB41_25 +; NO_SVE-NEXT: .LBB41_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB41_26 +; NO_SVE-NEXT: .LBB41_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB41_27 +; NO_SVE-NEXT: .LBB41_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB41_36 +; NO_SVE-NEXT: .LBB41_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB41_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB41_45 +; NO_SVE-NEXT: .LBB41_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB41_46 +; NO_SVE-NEXT: .LBB41_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB41_47 +; NO_SVE-NEXT: .LBB41_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB41_48 +; NO_SVE-NEXT: .LBB41_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB41_49 +; NO_SVE-NEXT: .LBB41_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB41_50 +; NO_SVE-NEXT: .LBB41_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB41_51 +; NO_SVE-NEXT: b .LBB41_52 +; NO_SVE-NEXT: .LBB41_44: // %cond.load52 +; 
NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB41_38 +; NO_SVE-NEXT: .LBB41_45: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB41_39 +; NO_SVE-NEXT: .LBB41_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB41_40 +; NO_SVE-NEXT: .LBB41_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB41_41 +; NO_SVE-NEXT: .LBB41_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB41_42 +; NO_SVE-NEXT: .LBB41_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB41_43 +; NO_SVE-NEXT: .LBB41_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB41_52 +; NO_SVE-NEXT: .LBB41_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB41_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB41_61 +; NO_SVE-NEXT: .LBB41_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB41_62 +; NO_SVE-NEXT: .LBB41_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB41_63 +; NO_SVE-NEXT: .LBB41_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB41_64 +; NO_SVE-NEXT: .LBB41_57: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB41_59 +; NO_SVE-NEXT: .LBB41_58: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: .LBB41_59: // %else92 +; NO_SVE-NEXT: sshll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll2 v5.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: sshll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll2 v4.4s, v3.8h, #0 +; NO_SVE-NEXT: sshll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: sshll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v3.4h, #0 +; NO_SVE-NEXT: sshll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB41_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB41_54 +; NO_SVE-NEXT: .LBB41_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB41_55 +; NO_SVE-NEXT: .LBB41_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB41_56 +; NO_SVE-NEXT: .LBB41_63: // %cond.load85 +; NO_SVE-NEXT: add x10, 
x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB41_57 +; NO_SVE-NEXT: .LBB41_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB41_58 +; NO_SVE-NEXT: b .LBB41_59 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -727,6 +10122,349 @@ } define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x1, #64] +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: cmeq v3.4s, v3.4s, #0 +; NO_SVE-NEXT: ldp q4, q5, [x1, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: cmeq v4.4s, v4.4s, #0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: cmeq v5.4s, v5.4s, #0 +; NO_SVE-NEXT: umov w9, v2.b[1] +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[5] +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: umov w11, v3.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: ldp q1, q4, [x1, #32] +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v2.4s, v4.4s, #0 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: bfi w10, w13, #1, #1 +; NO_SVE-NEXT: umov w16, v0.b[5] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[6] +; NO_SVE-NEXT: bfi w10, w14, #5, 
#1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[0] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #6 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v3.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB42_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB42_3 +; NO_SVE-NEXT: b .LBB42_4 +; NO_SVE-NEXT: .LBB42_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB42_4 +; NO_SVE-NEXT: .LBB42_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB42_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB42_9 +; NO_SVE-NEXT: .LBB42_6: // %else8 +; NO_SVE-NEXT: tbz w9, #4, .LBB42_10 +; NO_SVE-NEXT: .LBB42_7: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #5, .LBB42_11 +; NO_SVE-NEXT: b .LBB42_12 +; NO_SVE-NEXT: .LBB42_8: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB42_6 +; NO_SVE-NEXT: .LBB42_9: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #4, .LBB42_7 +; NO_SVE-NEXT: .LBB42_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #5, .LBB42_12 +; NO_SVE-NEXT: .LBB42_11: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_12: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB42_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB42_17 +; NO_SVE-NEXT: .LBB42_14: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB42_18 +; NO_SVE-NEXT: .LBB42_15: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB42_19 +; NO_SVE-NEXT: b .LBB42_20 +; NO_SVE-NEXT: .LBB42_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB42_14 +; NO_SVE-NEXT: .LBB42_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB42_15 +; NO_SVE-NEXT: .LBB42_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #9, .LBB42_20 +; NO_SVE-NEXT: .LBB42_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB42_24 +; NO_SVE-NEXT: // %bb.21: // 
%else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB42_25 +; NO_SVE-NEXT: .LBB42_22: // %else32 +; NO_SVE-NEXT: tbz w9, #12, .LBB42_26 +; NO_SVE-NEXT: .LBB42_23: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #13, .LBB42_27 +; NO_SVE-NEXT: b .LBB42_28 +; NO_SVE-NEXT: .LBB42_24: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB42_22 +; NO_SVE-NEXT: .LBB42_25: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #12, .LBB42_23 +; NO_SVE-NEXT: .LBB42_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #13, .LBB42_28 +; NO_SVE-NEXT: .LBB42_27: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_28: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB42_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB42_33 +; NO_SVE-NEXT: .LBB42_30: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB42_34 +; NO_SVE-NEXT: .LBB42_31: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB42_35 +; NO_SVE-NEXT: b .LBB42_36 +; NO_SVE-NEXT: .LBB42_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB42_30 +; NO_SVE-NEXT: .LBB42_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB42_31 +; NO_SVE-NEXT: .LBB42_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #17, .LBB42_36 +; NO_SVE-NEXT: .LBB42_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB42_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB42_41 +; NO_SVE-NEXT: .LBB42_38: // %else56 +; NO_SVE-NEXT: tbz w9, #20, .LBB42_42 +; NO_SVE-NEXT: .LBB42_39: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #21, .LBB42_43 +; NO_SVE-NEXT: b .LBB42_44 +; NO_SVE-NEXT: .LBB42_40: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB42_38 +; NO_SVE-NEXT: .LBB42_41: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #20, .LBB42_39 +; NO_SVE-NEXT: .LBB42_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #21, .LBB42_44 +; NO_SVE-NEXT: .LBB42_43: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_44: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB42_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB42_49 +; NO_SVE-NEXT: .LBB42_46: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB42_50 +; NO_SVE-NEXT: .LBB42_47: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB42_51 +; NO_SVE-NEXT: b .LBB42_52 +; NO_SVE-NEXT: .LBB42_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB42_46 +; NO_SVE-NEXT: .LBB42_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB42_47 +; NO_SVE-NEXT: .LBB42_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w9, #25, .LBB42_52 +; 
NO_SVE-NEXT: .LBB42_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB42_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB42_57 +; NO_SVE-NEXT: .LBB42_54: // %else80 +; NO_SVE-NEXT: tbz w9, #28, .LBB42_58 +; NO_SVE-NEXT: .LBB42_55: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #29, .LBB42_59 +; NO_SVE-NEXT: b .LBB42_60 +; NO_SVE-NEXT: .LBB42_56: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB42_54 +; NO_SVE-NEXT: .LBB42_57: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #28, .LBB42_55 +; NO_SVE-NEXT: .LBB42_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w9, #29, .LBB42_60 +; NO_SVE-NEXT: .LBB42_59: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_60: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB42_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB42_63 +; NO_SVE-NEXT: .LBB42_62: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: .LBB42_63: // %else92 +; NO_SVE-NEXT: sshll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: sshll2 v16.2d, v1.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: sshll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: sshll2 v0.2d, v4.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v4.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v5.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: sshll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v6.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: sshll2 v0.2d, v7.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v7.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB42_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB42_62 +; NO_SVE-NEXT: b .LBB42_63 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -743,6 +10481,1188 @@ } define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v128i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #48 +; NO_SVE-NEXT: .cfi_def_cfa_offset 48 +; NO_SVE-NEXT: str x19, [sp, #32] // 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset w19, -16 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and 
w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; 
NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: orr w15, w10, w15, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w13, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w15, w9, #16, #16 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w11, w14, #2, #1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[14] +; NO_SVE-NEXT: bfi w11, w14, #5, #1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v0.b[10] +; NO_SVE-NEXT: orr w10, w10, w17, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v0.b[12] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #14 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: orr w11, w11, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w16, lsl #11 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w13, w10, w13, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w14, lsl #13 +; NO_SVE-NEXT: orr w9, w10, w12, lsl #14 +; NO_SVE-NEXT: orr w10, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w10, w13, #16, #16 +; NO_SVE-NEXT: bfi x10, x15, #32, #32 +; NO_SVE-NEXT: tbz w10, #0, .LBB43_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w10, #1, .LBB43_3 +; NO_SVE-NEXT: b .LBB43_4 +; NO_SVE-NEXT: .LBB43_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w10, #1, .LBB43_4 +; NO_SVE-NEXT: .LBB43_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB43_4: // %else2 +; NO_SVE-NEXT: tbnz w10, #2, .LBB43_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w10, #3, .LBB43_21 +; NO_SVE-NEXT: .LBB43_6: // %else8 +; 
NO_SVE-NEXT: tbnz w10, #4, .LBB43_22 +; NO_SVE-NEXT: .LBB43_7: // %else11 +; NO_SVE-NEXT: tbnz w10, #5, .LBB43_23 +; NO_SVE-NEXT: .LBB43_8: // %else14 +; NO_SVE-NEXT: tbnz w10, #6, .LBB43_24 +; NO_SVE-NEXT: .LBB43_9: // %else17 +; NO_SVE-NEXT: tbnz w10, #7, .LBB43_25 +; NO_SVE-NEXT: .LBB43_10: // %else20 +; NO_SVE-NEXT: tbnz w10, #8, .LBB43_26 +; NO_SVE-NEXT: .LBB43_11: // %else23 +; NO_SVE-NEXT: tbnz w10, #9, .LBB43_27 +; NO_SVE-NEXT: .LBB43_12: // %else26 +; NO_SVE-NEXT: tbnz w10, #10, .LBB43_28 +; NO_SVE-NEXT: .LBB43_13: // %else29 +; NO_SVE-NEXT: tbnz w10, #11, .LBB43_29 +; NO_SVE-NEXT: .LBB43_14: // %else32 +; NO_SVE-NEXT: tbnz w10, #12, .LBB43_30 +; NO_SVE-NEXT: .LBB43_15: // %else35 +; NO_SVE-NEXT: tbnz w10, #13, .LBB43_31 +; NO_SVE-NEXT: .LBB43_16: // %else38 +; NO_SVE-NEXT: tbnz w10, #14, .LBB43_32 +; NO_SVE-NEXT: .LBB43_17: // %else41 +; NO_SVE-NEXT: tbnz w10, #15, .LBB43_33 +; NO_SVE-NEXT: .LBB43_18: // %else44 +; NO_SVE-NEXT: tbz w10, #16, .LBB43_34 +; NO_SVE-NEXT: .LBB43_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w10, #17, .LBB43_35 +; NO_SVE-NEXT: b .LBB43_36 +; NO_SVE-NEXT: .LBB43_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #3, .LBB43_6 +; NO_SVE-NEXT: .LBB43_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #4, .LBB43_7 +; NO_SVE-NEXT: .LBB43_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #5, .LBB43_8 +; NO_SVE-NEXT: .LBB43_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #6, .LBB43_9 +; NO_SVE-NEXT: .LBB43_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #7, .LBB43_10 +; NO_SVE-NEXT: .LBB43_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #8, .LBB43_11 +; NO_SVE-NEXT: .LBB43_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #9, .LBB43_12 +; NO_SVE-NEXT: .LBB43_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #10, .LBB43_13 +; NO_SVE-NEXT: .LBB43_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #11, .LBB43_14 +; NO_SVE-NEXT: .LBB43_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #12, .LBB43_15 +; NO_SVE-NEXT: .LBB43_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #13, .LBB43_16 +; NO_SVE-NEXT: .LBB43_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #14, .LBB43_17 +; NO_SVE-NEXT: .LBB43_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #15, .LBB43_18 +; NO_SVE-NEXT: .LBB43_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w10, #16, .LBB43_19 +; NO_SVE-NEXT: .LBB43_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w10, #17, .LBB43_36 +; NO_SVE-NEXT: .LBB43_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB43_36: // %else50 +; 
NO_SVE-NEXT: tbnz w10, #18, .LBB43_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w10, #19, .LBB43_53 +; NO_SVE-NEXT: .LBB43_38: // %else56 +; NO_SVE-NEXT: tbnz w10, #20, .LBB43_54 +; NO_SVE-NEXT: .LBB43_39: // %else59 +; NO_SVE-NEXT: tbnz w10, #21, .LBB43_55 +; NO_SVE-NEXT: .LBB43_40: // %else62 +; NO_SVE-NEXT: tbnz w10, #22, .LBB43_56 +; NO_SVE-NEXT: .LBB43_41: // %else65 +; NO_SVE-NEXT: tbnz w10, #23, .LBB43_57 +; NO_SVE-NEXT: .LBB43_42: // %else68 +; NO_SVE-NEXT: tbnz w10, #24, .LBB43_58 +; NO_SVE-NEXT: .LBB43_43: // %else71 +; NO_SVE-NEXT: tbnz w10, #25, .LBB43_59 +; NO_SVE-NEXT: .LBB43_44: // %else74 +; NO_SVE-NEXT: tbnz w10, #26, .LBB43_60 +; NO_SVE-NEXT: .LBB43_45: // %else77 +; NO_SVE-NEXT: tbnz w10, #27, .LBB43_61 +; NO_SVE-NEXT: .LBB43_46: // %else80 +; NO_SVE-NEXT: tbnz w10, #28, .LBB43_62 +; NO_SVE-NEXT: .LBB43_47: // %else83 +; NO_SVE-NEXT: tbnz w10, #29, .LBB43_63 +; NO_SVE-NEXT: .LBB43_48: // %else86 +; NO_SVE-NEXT: tbnz w10, #30, .LBB43_64 +; NO_SVE-NEXT: .LBB43_49: // %else89 +; NO_SVE-NEXT: tbnz w10, #31, .LBB43_65 +; NO_SVE-NEXT: .LBB43_50: // %else92 +; NO_SVE-NEXT: tbz x10, #32, .LBB43_66 +; NO_SVE-NEXT: .LBB43_51: // %cond.load94 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz x10, #33, .LBB43_67 +; NO_SVE-NEXT: b .LBB43_68 +; NO_SVE-NEXT: .LBB43_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #19, .LBB43_38 +; NO_SVE-NEXT: .LBB43_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #20, .LBB43_39 +; NO_SVE-NEXT: .LBB43_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #21, .LBB43_40 +; NO_SVE-NEXT: .LBB43_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #22, .LBB43_41 +; NO_SVE-NEXT: .LBB43_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #23, .LBB43_42 +; NO_SVE-NEXT: .LBB43_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #24, .LBB43_43 +; NO_SVE-NEXT: .LBB43_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #25, .LBB43_44 +; NO_SVE-NEXT: .LBB43_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #26, .LBB43_45 +; NO_SVE-NEXT: .LBB43_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #27, .LBB43_46 +; NO_SVE-NEXT: .LBB43_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #28, .LBB43_47 +; NO_SVE-NEXT: .LBB43_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #29, .LBB43_48 +; NO_SVE-NEXT: .LBB43_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #30, .LBB43_49 +; NO_SVE-NEXT: .LBB43_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #31, .LBB43_50 +; NO_SVE-NEXT: .LBB43_65: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: tbnz x10, #32, .LBB43_51 +; NO_SVE-NEXT: .LBB43_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: 
tbz x10, #33, .LBB43_68 +; NO_SVE-NEXT: .LBB43_67: // %cond.load97 +; NO_SVE-NEXT: add x9, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB43_68: // %else98 +; NO_SVE-NEXT: tbnz x10, #34, .LBB43_91 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x10, #35, .LBB43_92 +; NO_SVE-NEXT: .LBB43_70: // %else104 +; NO_SVE-NEXT: tbnz x10, #36, .LBB43_93 +; NO_SVE-NEXT: .LBB43_71: // %else107 +; NO_SVE-NEXT: tbnz x10, #37, .LBB43_94 +; NO_SVE-NEXT: .LBB43_72: // %else110 +; NO_SVE-NEXT: tbnz x10, #38, .LBB43_95 +; NO_SVE-NEXT: .LBB43_73: // %else113 +; NO_SVE-NEXT: tbnz x10, #39, .LBB43_96 +; NO_SVE-NEXT: .LBB43_74: // %else116 +; NO_SVE-NEXT: tbnz x10, #40, .LBB43_97 +; NO_SVE-NEXT: .LBB43_75: // %else119 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: tbz x10, #41, .LBB43_77 +; NO_SVE-NEXT: .LBB43_76: // %cond.load121 +; NO_SVE-NEXT: add x9, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: .LBB43_77: // %else122 +; NO_SVE-NEXT: cmeq v7.16b, v4.16b, #0 +; NO_SVE-NEXT: ldp q3, q4, [x1, #96] +; NO_SVE-NEXT: cmeq v5.16b, v5.16b, #0 +; NO_SVE-NEXT: tbz x10, #42, .LBB43_79 +; NO_SVE-NEXT: // %bb.78: // %cond.load124 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: .LBB43_79: // %else125 +; NO_SVE-NEXT: umov w13, v7.b[1] +; NO_SVE-NEXT: umov w16, v7.b[0] +; NO_SVE-NEXT: umov w9, v5.b[1] +; NO_SVE-NEXT: umov w12, v5.b[0] +; NO_SVE-NEXT: cmeq v6.16b, v4.16b, #0 +; NO_SVE-NEXT: cmeq v4.16b, v3.16b, #0 +; NO_SVE-NEXT: tbz x10, #43, .LBB43_81 +; NO_SVE-NEXT: // %bb.80: // %cond.load127 +; NO_SVE-NEXT: add x11, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x11] +; NO_SVE-NEXT: .LBB43_81: // %else128 +; NO_SVE-NEXT: umov w15, v6.b[1] +; NO_SVE-NEXT: umov w18, v6.b[0] +; NO_SVE-NEXT: umov w11, v4.b[1] +; NO_SVE-NEXT: umov w1, v4.b[0] +; NO_SVE-NEXT: and w14, w13, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w3, v7.b[2] +; NO_SVE-NEXT: umov w16, v5.b[2] +; NO_SVE-NEXT: and w2, w9, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: tbz x10, #44, .LBB43_83 +; NO_SVE-NEXT: // %bb.82: // %cond.load130 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: .LBB43_83: // %else131 +; NO_SVE-NEXT: and w17, w15, #0x1 +; NO_SVE-NEXT: and w9, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[2] +; NO_SVE-NEXT: and w15, w11, #0x1 +; NO_SVE-NEXT: and w11, w1, #0x1 +; NO_SVE-NEXT: umov w1, v4.b[2] +; NO_SVE-NEXT: bfi w13, w14, #1, #1 +; NO_SVE-NEXT: umov w4, v7.b[3] +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w12, w2, #1, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: tbz x10, #45, .LBB43_85 +; NO_SVE-NEXT: // %bb.84: // %cond.load133 +; NO_SVE-NEXT: add x2, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x2] +; NO_SVE-NEXT: .LBB43_85: // %else134 +; NO_SVE-NEXT: bfi w9, w17, #1, #1 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[3] +; NO_SVE-NEXT: bfi w11, w15, #1, #1 +; NO_SVE-NEXT: umov w2, v4.b[3] +; NO_SVE-NEXT: bfi w13, w3, #2, #1 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[4] +; NO_SVE-NEXT: umov w15, v5.b[4] +; NO_SVE-NEXT: and w1, w1, #0x1 +; NO_SVE-NEXT: bfi w12, w16, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #46, .LBB43_87 +; NO_SVE-NEXT: // %bb.86: // %cond.load136 +; NO_SVE-NEXT: add x16, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x16] +; NO_SVE-NEXT: .LBB43_87: // %else137 +; NO_SVE-NEXT: bfi w9, w17, #2, #1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: umov w17, v6.b[4] +; 
NO_SVE-NEXT: bfi w11, w1, #2, #1 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[4] +; NO_SVE-NEXT: umov w5, v7.b[5] +; NO_SVE-NEXT: umov w1, v5.b[5] +; NO_SVE-NEXT: bfi w13, w3, #3, #1 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: bfi w12, w14, #3, #1 +; NO_SVE-NEXT: and w4, w15, #0x1 +; NO_SVE-NEXT: tbz x10, #47, .LBB43_89 +; NO_SVE-NEXT: // %bb.88: // %cond.load139 +; NO_SVE-NEXT: add x14, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x14] +; NO_SVE-NEXT: .LBB43_89: // %else140 +; NO_SVE-NEXT: bfi w9, w16, #3, #1 +; NO_SVE-NEXT: umov w16, v6.b[5] +; NO_SVE-NEXT: bfi w11, w18, #3, #1 +; NO_SVE-NEXT: umov w18, v4.b[5] +; NO_SVE-NEXT: bfi w13, w3, #4, #1 +; NO_SVE-NEXT: umov w3, v7.b[6] +; NO_SVE-NEXT: umov w14, v5.b[6] +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: and w17, w2, #0x1 +; NO_SVE-NEXT: and w2, w5, #0x1 +; NO_SVE-NEXT: bfi w12, w4, #4, #1 +; NO_SVE-NEXT: and w1, w1, #0x1 +; NO_SVE-NEXT: tbz x10, #48, .LBB43_98 +; NO_SVE-NEXT: // %bb.90: // %cond.load142 +; NO_SVE-NEXT: add x4, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x4] +; NO_SVE-NEXT: b .LBB43_99 +; NO_SVE-NEXT: .LBB43_91: // %cond.load100 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz x10, #35, .LBB43_70 +; NO_SVE-NEXT: .LBB43_92: // %cond.load103 +; NO_SVE-NEXT: add x9, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz x10, #36, .LBB43_71 +; NO_SVE-NEXT: .LBB43_93: // %cond.load106 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz x10, #37, .LBB43_72 +; NO_SVE-NEXT: .LBB43_94: // %cond.load109 +; NO_SVE-NEXT: add x9, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz x10, #38, .LBB43_73 +; NO_SVE-NEXT: .LBB43_95: // %cond.load112 +; NO_SVE-NEXT: add x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz x10, #39, .LBB43_74 +; NO_SVE-NEXT: .LBB43_96: // %cond.load115 +; NO_SVE-NEXT: add x9, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz x10, #40, .LBB43_75 +; NO_SVE-NEXT: .LBB43_97: // %cond.load118 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: tbnz x10, #41, .LBB43_76 +; NO_SVE-NEXT: b .LBB43_77 +; NO_SVE-NEXT: .LBB43_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB43_99: // %else143 +; NO_SVE-NEXT: bfi w9, w15, #4, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v6.b[6] +; NO_SVE-NEXT: bfi w11, w17, #4, #1 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v4.b[6] +; NO_SVE-NEXT: bfi w13, w2, #5, #1 +; NO_SVE-NEXT: umov w4, v7.b[7] +; NO_SVE-NEXT: umov w2, v5.b[7] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w12, w1, #5, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #49, .LBB43_101 +; NO_SVE-NEXT: // %bb.100: // %cond.load145 +; NO_SVE-NEXT: add x1, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x1] +; NO_SVE-NEXT: .LBB43_101: // %else146 +; NO_SVE-NEXT: bfi w9, w15, #5, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v6.b[7] +; NO_SVE-NEXT: bfi w11, w17, #5, #1 +; NO_SVE-NEXT: umov w6, v4.b[7] +; NO_SVE-NEXT: orr w17, w13, w3, lsl #6 +; NO_SVE-NEXT: umov w19, v7.b[8] +; NO_SVE-NEXT: umov w13, v5.b[8] +; NO_SVE-NEXT: and w5, w18, #0x1 +; NO_SVE-NEXT: and w7, w4, #0x1 +; NO_SVE-NEXT: orr w18, w12, w14, lsl #6 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: tbz x10, #50, .LBB43_103 +; NO_SVE-NEXT: // %bb.102: // %cond.load148 +; NO_SVE-NEXT: add x12, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x12] +; 
NO_SVE-NEXT: .LBB43_103: // %else149 +; NO_SVE-NEXT: orr w12, w9, w15, lsl #6 +; NO_SVE-NEXT: umov w3, v6.b[8] +; NO_SVE-NEXT: orr w11, w11, w5, lsl #6 +; NO_SVE-NEXT: umov w5, v4.b[8] +; NO_SVE-NEXT: orr w14, w17, w7, lsl #7 +; NO_SVE-NEXT: umov w7, v7.b[9] +; NO_SVE-NEXT: umov w9, v5.b[9] +; NO_SVE-NEXT: and w2, w16, #0x1 +; NO_SVE-NEXT: and w4, w6, #0x1 +; NO_SVE-NEXT: and w6, w19, #0x1 +; NO_SVE-NEXT: orr w15, w18, w1, lsl #7 +; NO_SVE-NEXT: and w16, w13, #0x1 +; NO_SVE-NEXT: tbz x10, #51, .LBB43_105 +; NO_SVE-NEXT: // %bb.104: // %cond.load151 +; NO_SVE-NEXT: add x13, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x13] +; NO_SVE-NEXT: .LBB43_105: // %else152 +; NO_SVE-NEXT: orr w12, w12, w2, lsl #7 +; NO_SVE-NEXT: umov w18, v6.b[9] +; NO_SVE-NEXT: orr w13, w11, w4, lsl #7 +; NO_SVE-NEXT: umov w2, v4.b[9] +; NO_SVE-NEXT: umov w4, v7.b[10] +; NO_SVE-NEXT: umov w11, v5.b[10] +; NO_SVE-NEXT: and w17, w3, #0x1 +; NO_SVE-NEXT: and w1, w5, #0x1 +; NO_SVE-NEXT: orr w14, w14, w6, lsl #8 +; NO_SVE-NEXT: and w3, w7, #0x1 +; NO_SVE-NEXT: orr w15, w15, w16, lsl #8 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #52, .LBB43_107 +; NO_SVE-NEXT: // %bb.106: // %cond.load154 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x9] +; NO_SVE-NEXT: .LBB43_107: // %else155 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #8 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[10] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #8 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[10] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #9 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[11] +; NO_SVE-NEXT: umov w9, v5.b[11] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #9 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #53, .LBB43_109 +; NO_SVE-NEXT: // %bb.108: // %cond.load157 +; NO_SVE-NEXT: add x11, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x11] +; NO_SVE-NEXT: .LBB43_109: // %else158 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #9 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[11] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #9 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[11] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #10 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[12] +; NO_SVE-NEXT: umov w11, v5.b[12] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #10 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #54, .LBB43_111 +; NO_SVE-NEXT: // %bb.110: // %cond.load160 +; NO_SVE-NEXT: add x9, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x9] +; NO_SVE-NEXT: .LBB43_111: // %else161 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #10 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[12] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #10 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[12] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #11 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v7.b[13] +; NO_SVE-NEXT: umov w9, v5.b[13] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #11 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #55, .LBB43_113 +; NO_SVE-NEXT: // %bb.112: // %cond.load163 +; NO_SVE-NEXT: add x11, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x11] +; NO_SVE-NEXT: .LBB43_113: // %else164 +; NO_SVE-NEXT: orr w11, w12, w17, lsl #11 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[13] +; NO_SVE-NEXT: orr w12, w13, w1, lsl #11 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[13] +; NO_SVE-NEXT: orr w13, w14, w3, lsl #12 +; NO_SVE-NEXT: and w14, w4, #0x1 +; NO_SVE-NEXT: umov w3, v7.b[14] +; 
NO_SVE-NEXT: umov w4, v5.b[14] +; NO_SVE-NEXT: orr w5, w15, w16, lsl #12 +; NO_SVE-NEXT: and w6, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #56, .LBB43_115 +; NO_SVE-NEXT: // %bb.114: // %cond.load166 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x9] +; NO_SVE-NEXT: .LBB43_115: // %else167 +; NO_SVE-NEXT: orr w15, w11, w17, lsl #12 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: umov w17, v6.b[14] +; NO_SVE-NEXT: orr w18, w12, w1, lsl #12 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v4.b[14] +; NO_SVE-NEXT: orr w9, w13, w14, lsl #13 +; NO_SVE-NEXT: and w11, w3, #0x1 +; NO_SVE-NEXT: orr w12, w5, w6, lsl #13 +; NO_SVE-NEXT: and w13, w4, #0x1 +; NO_SVE-NEXT: tbz x10, #57, .LBB43_117 +; NO_SVE-NEXT: // %bb.116: // %cond.load169 +; NO_SVE-NEXT: add x14, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x14] +; NO_SVE-NEXT: .LBB43_117: // %else170 +; NO_SVE-NEXT: orr w14, w15, w16, lsl #13 +; NO_SVE-NEXT: orr w15, w18, w1, lsl #13 +; NO_SVE-NEXT: umov w18, v7.b[15] +; NO_SVE-NEXT: umov w1, v5.b[15] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: and w17, w2, #0x1 +; NO_SVE-NEXT: orr w2, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: tbz x10, #58, .LBB43_119 +; NO_SVE-NEXT: // %bb.118: // %cond.load172 +; NO_SVE-NEXT: add x11, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x11] +; NO_SVE-NEXT: .LBB43_119: // %else173 +; NO_SVE-NEXT: umov w12, v6.b[15] +; NO_SVE-NEXT: orr w13, w15, w17, lsl #14 +; NO_SVE-NEXT: umov w15, v4.b[15] +; NO_SVE-NEXT: orr w11, w14, w16, lsl #14 +; NO_SVE-NEXT: orr w14, w2, w18, lsl #15 +; NO_SVE-NEXT: orr w9, w9, w1, lsl #15 +; NO_SVE-NEXT: tbz x10, #59, .LBB43_121 +; NO_SVE-NEXT: // %bb.120: // %cond.load175 +; NO_SVE-NEXT: add x16, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x16] +; NO_SVE-NEXT: .LBB43_121: // %else176 +; NO_SVE-NEXT: orr w12, w11, w12, lsl #15 +; NO_SVE-NEXT: orr w11, w13, w15, lsl #15 +; NO_SVE-NEXT: bfi w9, w14, #16, #16 +; NO_SVE-NEXT: tbnz x10, #60, .LBB43_127 +; NO_SVE-NEXT: // %bb.122: // %else179 +; NO_SVE-NEXT: bfi w11, w12, #16, #16 +; NO_SVE-NEXT: tbnz x10, #61, .LBB43_128 +; NO_SVE-NEXT: .LBB43_123: // %else182 +; NO_SVE-NEXT: tbnz x10, #62, .LBB43_129 +; NO_SVE-NEXT: .LBB43_124: // %else185 +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbnz x10, #63, .LBB43_130 +; NO_SVE-NEXT: .LBB43_125: // %else188 +; NO_SVE-NEXT: tbz w9, #0, .LBB43_131 +; NO_SVE-NEXT: .LBB43_126: // %cond.load190 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #1, .LBB43_132 +; NO_SVE-NEXT: b .LBB43_133 +; NO_SVE-NEXT: .LBB43_127: // %cond.load178 +; NO_SVE-NEXT: add x13, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x13] +; NO_SVE-NEXT: bfi w11, w12, #16, #16 +; NO_SVE-NEXT: tbz x10, #61, .LBB43_123 +; NO_SVE-NEXT: .LBB43_128: // %cond.load181 +; NO_SVE-NEXT: add x12, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x12] +; NO_SVE-NEXT: tbz x10, #62, .LBB43_124 +; NO_SVE-NEXT: .LBB43_129: // %cond.load184 +; NO_SVE-NEXT: add x12, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x12] +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbz x10, #63, .LBB43_125 +; NO_SVE-NEXT: .LBB43_130: // %cond.load187 +; NO_SVE-NEXT: add x10, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #0, .LBB43_126 +; NO_SVE-NEXT: .LBB43_131: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #1, .LBB43_133 +; NO_SVE-NEXT: .LBB43_132: // %cond.load193 +; NO_SVE-NEXT: add x10, x0, #65 +; NO_SVE-NEXT: ld1 { v4.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_133: // %else194 +; 
NO_SVE-NEXT: tbnz w9, #2, .LBB43_149 +; NO_SVE-NEXT: // %bb.134: // %else197 +; NO_SVE-NEXT: tbnz w9, #3, .LBB43_150 +; NO_SVE-NEXT: .LBB43_135: // %else200 +; NO_SVE-NEXT: tbnz w9, #4, .LBB43_151 +; NO_SVE-NEXT: .LBB43_136: // %else203 +; NO_SVE-NEXT: tbnz w9, #5, .LBB43_152 +; NO_SVE-NEXT: .LBB43_137: // %else206 +; NO_SVE-NEXT: tbnz w9, #6, .LBB43_153 +; NO_SVE-NEXT: .LBB43_138: // %else209 +; NO_SVE-NEXT: tbnz w9, #7, .LBB43_154 +; NO_SVE-NEXT: .LBB43_139: // %else212 +; NO_SVE-NEXT: tbnz w9, #8, .LBB43_155 +; NO_SVE-NEXT: .LBB43_140: // %else215 +; NO_SVE-NEXT: tbnz w9, #9, .LBB43_156 +; NO_SVE-NEXT: .LBB43_141: // %else218 +; NO_SVE-NEXT: tbnz w9, #10, .LBB43_157 +; NO_SVE-NEXT: .LBB43_142: // %else221 +; NO_SVE-NEXT: tbnz w9, #11, .LBB43_158 +; NO_SVE-NEXT: .LBB43_143: // %else224 +; NO_SVE-NEXT: tbnz w9, #12, .LBB43_159 +; NO_SVE-NEXT: .LBB43_144: // %else227 +; NO_SVE-NEXT: tbnz w9, #13, .LBB43_160 +; NO_SVE-NEXT: .LBB43_145: // %else230 +; NO_SVE-NEXT: tbnz w9, #14, .LBB43_161 +; NO_SVE-NEXT: .LBB43_146: // %else233 +; NO_SVE-NEXT: tbnz w9, #15, .LBB43_162 +; NO_SVE-NEXT: .LBB43_147: // %else236 +; NO_SVE-NEXT: tbz w9, #16, .LBB43_163 +; NO_SVE-NEXT: .LBB43_148: // %cond.load238 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB43_164 +; NO_SVE-NEXT: b .LBB43_165 +; NO_SVE-NEXT: .LBB43_149: // %cond.load196 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB43_135 +; NO_SVE-NEXT: .LBB43_150: // %cond.load199 +; NO_SVE-NEXT: add x10, x0, #67 +; NO_SVE-NEXT: ld1 { v4.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB43_136 +; NO_SVE-NEXT: .LBB43_151: // %cond.load202 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB43_137 +; NO_SVE-NEXT: .LBB43_152: // %cond.load205 +; NO_SVE-NEXT: add x10, x0, #69 +; NO_SVE-NEXT: ld1 { v4.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB43_138 +; NO_SVE-NEXT: .LBB43_153: // %cond.load208 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB43_139 +; NO_SVE-NEXT: .LBB43_154: // %cond.load211 +; NO_SVE-NEXT: add x10, x0, #71 +; NO_SVE-NEXT: ld1 { v4.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB43_140 +; NO_SVE-NEXT: .LBB43_155: // %cond.load214 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB43_141 +; NO_SVE-NEXT: .LBB43_156: // %cond.load217 +; NO_SVE-NEXT: add x10, x0, #73 +; NO_SVE-NEXT: ld1 { v4.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB43_142 +; NO_SVE-NEXT: .LBB43_157: // %cond.load220 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB43_143 +; NO_SVE-NEXT: .LBB43_158: // %cond.load223 +; NO_SVE-NEXT: add x10, x0, #75 +; NO_SVE-NEXT: ld1 { v4.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB43_144 +; NO_SVE-NEXT: .LBB43_159: // %cond.load226 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB43_145 +; NO_SVE-NEXT: .LBB43_160: // %cond.load229 +; NO_SVE-NEXT: add x10, x0, #77 +; NO_SVE-NEXT: ld1 { v4.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB43_146 +; NO_SVE-NEXT: .LBB43_161: // %cond.load232 +; NO_SVE-NEXT: add x10, x0, #78 +; NO_SVE-NEXT: ld1 { v4.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB43_147 +; NO_SVE-NEXT: .LBB43_162: // %cond.load235 +; NO_SVE-NEXT: add x10, x0, #79 +; NO_SVE-NEXT: ld1 { v4.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB43_148 +; 
NO_SVE-NEXT: .LBB43_163: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #17, .LBB43_165 +; NO_SVE-NEXT: .LBB43_164: // %cond.load241 +; NO_SVE-NEXT: add x10, x0, #81 +; NO_SVE-NEXT: ld1 { v5.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_165: // %else242 +; NO_SVE-NEXT: tbnz w9, #18, .LBB43_181 +; NO_SVE-NEXT: // %bb.166: // %else245 +; NO_SVE-NEXT: tbnz w9, #19, .LBB43_182 +; NO_SVE-NEXT: .LBB43_167: // %else248 +; NO_SVE-NEXT: tbnz w9, #20, .LBB43_183 +; NO_SVE-NEXT: .LBB43_168: // %else251 +; NO_SVE-NEXT: tbnz w9, #21, .LBB43_184 +; NO_SVE-NEXT: .LBB43_169: // %else254 +; NO_SVE-NEXT: tbnz w9, #22, .LBB43_185 +; NO_SVE-NEXT: .LBB43_170: // %else257 +; NO_SVE-NEXT: tbnz w9, #23, .LBB43_186 +; NO_SVE-NEXT: .LBB43_171: // %else260 +; NO_SVE-NEXT: tbnz w9, #24, .LBB43_187 +; NO_SVE-NEXT: .LBB43_172: // %else263 +; NO_SVE-NEXT: tbnz w9, #25, .LBB43_188 +; NO_SVE-NEXT: .LBB43_173: // %else266 +; NO_SVE-NEXT: tbnz w9, #26, .LBB43_189 +; NO_SVE-NEXT: .LBB43_174: // %else269 +; NO_SVE-NEXT: tbnz w9, #27, .LBB43_190 +; NO_SVE-NEXT: .LBB43_175: // %else272 +; NO_SVE-NEXT: tbnz w9, #28, .LBB43_191 +; NO_SVE-NEXT: .LBB43_176: // %else275 +; NO_SVE-NEXT: tbnz w9, #29, .LBB43_192 +; NO_SVE-NEXT: .LBB43_177: // %else278 +; NO_SVE-NEXT: tbnz w9, #30, .LBB43_193 +; NO_SVE-NEXT: .LBB43_178: // %else281 +; NO_SVE-NEXT: tbnz w9, #31, .LBB43_194 +; NO_SVE-NEXT: .LBB43_179: // %else284 +; NO_SVE-NEXT: tbz x9, #32, .LBB43_195 +; NO_SVE-NEXT: .LBB43_180: // %cond.load286 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB43_196 +; NO_SVE-NEXT: b .LBB43_197 +; NO_SVE-NEXT: .LBB43_181: // %cond.load244 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB43_167 +; NO_SVE-NEXT: .LBB43_182: // %cond.load247 +; NO_SVE-NEXT: add x10, x0, #83 +; NO_SVE-NEXT: ld1 { v5.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB43_168 +; NO_SVE-NEXT: .LBB43_183: // %cond.load250 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB43_169 +; NO_SVE-NEXT: .LBB43_184: // %cond.load253 +; NO_SVE-NEXT: add x10, x0, #85 +; NO_SVE-NEXT: ld1 { v5.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB43_170 +; NO_SVE-NEXT: .LBB43_185: // %cond.load256 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB43_171 +; NO_SVE-NEXT: .LBB43_186: // %cond.load259 +; NO_SVE-NEXT: add x10, x0, #87 +; NO_SVE-NEXT: ld1 { v5.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB43_172 +; NO_SVE-NEXT: .LBB43_187: // %cond.load262 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB43_173 +; NO_SVE-NEXT: .LBB43_188: // %cond.load265 +; NO_SVE-NEXT: add x10, x0, #89 +; NO_SVE-NEXT: ld1 { v5.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB43_174 +; NO_SVE-NEXT: .LBB43_189: // %cond.load268 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB43_175 +; NO_SVE-NEXT: .LBB43_190: // %cond.load271 +; NO_SVE-NEXT: add x10, x0, #91 +; NO_SVE-NEXT: ld1 { v5.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB43_176 +; NO_SVE-NEXT: .LBB43_191: // %cond.load274 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB43_177 +; NO_SVE-NEXT: .LBB43_192: // %cond.load277 +; NO_SVE-NEXT: add x10, x0, #93 +; NO_SVE-NEXT: ld1 { v5.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB43_178 +; NO_SVE-NEXT: .LBB43_193: // %cond.load280 
+; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB43_179 +; NO_SVE-NEXT: .LBB43_194: // %cond.load283 +; NO_SVE-NEXT: add x10, x0, #95 +; NO_SVE-NEXT: ld1 { v5.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB43_180 +; NO_SVE-NEXT: .LBB43_195: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #33, .LBB43_197 +; NO_SVE-NEXT: .LBB43_196: // %cond.load289 +; NO_SVE-NEXT: add x10, x0, #97 +; NO_SVE-NEXT: ld1 { v6.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_197: // %else290 +; NO_SVE-NEXT: tbnz x9, #34, .LBB43_213 +; NO_SVE-NEXT: // %bb.198: // %else293 +; NO_SVE-NEXT: tbnz x9, #35, .LBB43_214 +; NO_SVE-NEXT: .LBB43_199: // %else296 +; NO_SVE-NEXT: tbnz x9, #36, .LBB43_215 +; NO_SVE-NEXT: .LBB43_200: // %else299 +; NO_SVE-NEXT: tbnz x9, #37, .LBB43_216 +; NO_SVE-NEXT: .LBB43_201: // %else302 +; NO_SVE-NEXT: tbnz x9, #38, .LBB43_217 +; NO_SVE-NEXT: .LBB43_202: // %else305 +; NO_SVE-NEXT: tbnz x9, #39, .LBB43_218 +; NO_SVE-NEXT: .LBB43_203: // %else308 +; NO_SVE-NEXT: tbnz x9, #40, .LBB43_219 +; NO_SVE-NEXT: .LBB43_204: // %else311 +; NO_SVE-NEXT: tbnz x9, #41, .LBB43_220 +; NO_SVE-NEXT: .LBB43_205: // %else314 +; NO_SVE-NEXT: tbnz x9, #42, .LBB43_221 +; NO_SVE-NEXT: .LBB43_206: // %else317 +; NO_SVE-NEXT: tbnz x9, #43, .LBB43_222 +; NO_SVE-NEXT: .LBB43_207: // %else320 +; NO_SVE-NEXT: tbnz x9, #44, .LBB43_223 +; NO_SVE-NEXT: .LBB43_208: // %else323 +; NO_SVE-NEXT: tbnz x9, #45, .LBB43_224 +; NO_SVE-NEXT: .LBB43_209: // %else326 +; NO_SVE-NEXT: tbnz x9, #46, .LBB43_225 +; NO_SVE-NEXT: .LBB43_210: // %else329 +; NO_SVE-NEXT: tbnz x9, #47, .LBB43_226 +; NO_SVE-NEXT: .LBB43_211: // %else332 +; NO_SVE-NEXT: tbz x9, #48, .LBB43_227 +; NO_SVE-NEXT: .LBB43_212: // %cond.load334 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB43_228 +; NO_SVE-NEXT: b .LBB43_229 +; NO_SVE-NEXT: .LBB43_213: // %cond.load292 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB43_199 +; NO_SVE-NEXT: .LBB43_214: // %cond.load295 +; NO_SVE-NEXT: add x10, x0, #99 +; NO_SVE-NEXT: ld1 { v6.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB43_200 +; NO_SVE-NEXT: .LBB43_215: // %cond.load298 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB43_201 +; NO_SVE-NEXT: .LBB43_216: // %cond.load301 +; NO_SVE-NEXT: add x10, x0, #101 +; NO_SVE-NEXT: ld1 { v6.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB43_202 +; NO_SVE-NEXT: .LBB43_217: // %cond.load304 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB43_203 +; NO_SVE-NEXT: .LBB43_218: // %cond.load307 +; NO_SVE-NEXT: add x10, x0, #103 +; NO_SVE-NEXT: ld1 { v6.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB43_204 +; NO_SVE-NEXT: .LBB43_219: // %cond.load310 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB43_205 +; NO_SVE-NEXT: .LBB43_220: // %cond.load313 +; NO_SVE-NEXT: add x10, x0, #105 +; NO_SVE-NEXT: ld1 { v6.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB43_206 +; NO_SVE-NEXT: .LBB43_221: // %cond.load316 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB43_207 +; NO_SVE-NEXT: .LBB43_222: // %cond.load319 +; NO_SVE-NEXT: add x10, x0, #107 +; NO_SVE-NEXT: ld1 { v6.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB43_208 +; NO_SVE-NEXT: .LBB43_223: // %cond.load322 +; NO_SVE-NEXT: add x10, x0, #108 
+; NO_SVE-NEXT: ld1 { v6.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB43_209 +; NO_SVE-NEXT: .LBB43_224: // %cond.load325 +; NO_SVE-NEXT: add x10, x0, #109 +; NO_SVE-NEXT: ld1 { v6.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB43_210 +; NO_SVE-NEXT: .LBB43_225: // %cond.load328 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB43_211 +; NO_SVE-NEXT: .LBB43_226: // %cond.load331 +; NO_SVE-NEXT: add x10, x0, #111 +; NO_SVE-NEXT: ld1 { v6.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB43_212 +; NO_SVE-NEXT: .LBB43_227: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #49, .LBB43_229 +; NO_SVE-NEXT: .LBB43_228: // %cond.load337 +; NO_SVE-NEXT: add x10, x0, #113 +; NO_SVE-NEXT: ld1 { v7.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_229: // %else338 +; NO_SVE-NEXT: tbnz x9, #50, .LBB43_245 +; NO_SVE-NEXT: // %bb.230: // %else341 +; NO_SVE-NEXT: tbnz x9, #51, .LBB43_246 +; NO_SVE-NEXT: .LBB43_231: // %else344 +; NO_SVE-NEXT: tbnz x9, #52, .LBB43_247 +; NO_SVE-NEXT: .LBB43_232: // %else347 +; NO_SVE-NEXT: tbnz x9, #53, .LBB43_248 +; NO_SVE-NEXT: .LBB43_233: // %else350 +; NO_SVE-NEXT: tbnz x9, #54, .LBB43_249 +; NO_SVE-NEXT: .LBB43_234: // %else353 +; NO_SVE-NEXT: tbnz x9, #55, .LBB43_250 +; NO_SVE-NEXT: .LBB43_235: // %else356 +; NO_SVE-NEXT: tbnz x9, #56, .LBB43_251 +; NO_SVE-NEXT: .LBB43_236: // %else359 +; NO_SVE-NEXT: tbnz x9, #57, .LBB43_252 +; NO_SVE-NEXT: .LBB43_237: // %else362 +; NO_SVE-NEXT: tbnz x9, #58, .LBB43_253 +; NO_SVE-NEXT: .LBB43_238: // %else365 +; NO_SVE-NEXT: tbnz x9, #59, .LBB43_254 +; NO_SVE-NEXT: .LBB43_239: // %else368 +; NO_SVE-NEXT: tbnz x9, #60, .LBB43_255 +; NO_SVE-NEXT: .LBB43_240: // %else371 +; NO_SVE-NEXT: tbnz x9, #61, .LBB43_256 +; NO_SVE-NEXT: .LBB43_241: // %else374 +; NO_SVE-NEXT: tbnz x9, #62, .LBB43_257 +; NO_SVE-NEXT: .LBB43_242: // %else377 +; NO_SVE-NEXT: tbz x9, #63, .LBB43_244 +; NO_SVE-NEXT: .LBB43_243: // %cond.load379 +; NO_SVE-NEXT: add x9, x0, #127 +; NO_SVE-NEXT: ld1 { v7.b }[15], [x9] +; NO_SVE-NEXT: .LBB43_244: // %else380 +; NO_SVE-NEXT: ushll2 v16.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v17.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q1, q17, [x8, #32] +; NO_SVE-NEXT: ushll2 v1.8h, v3.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: ushll v0.8h, v3.8b, #0 +; NO_SVE-NEXT: ushll2 v2.8h, v4.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #96] +; NO_SVE-NEXT: ushll v1.8h, v4.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v5.16b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #128] +; NO_SVE-NEXT: ushll v2.8h, v5.8b, #0 +; NO_SVE-NEXT: ushll2 v1.8h, v6.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.8h, v6.8b, #0 +; NO_SVE-NEXT: ushll2 v2.8h, v7.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #192] +; NO_SVE-NEXT: ushll v1.8h, v7.8b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #224] +; NO_SVE-NEXT: ldr x19, [sp, #32] // 8-byte Folded Reload +; NO_SVE-NEXT: add sp, sp, #48 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB43_245: // %cond.load340 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB43_231 +; NO_SVE-NEXT: .LBB43_246: // %cond.load343 +; NO_SVE-NEXT: add x10, x0, #115 +; NO_SVE-NEXT: ld1 { v7.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB43_232 +; NO_SVE-NEXT: .LBB43_247: // %cond.load346 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.b 
}[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB43_233 +; NO_SVE-NEXT: .LBB43_248: // %cond.load349 +; NO_SVE-NEXT: add x10, x0, #117 +; NO_SVE-NEXT: ld1 { v7.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB43_234 +; NO_SVE-NEXT: .LBB43_249: // %cond.load352 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB43_235 +; NO_SVE-NEXT: .LBB43_250: // %cond.load355 +; NO_SVE-NEXT: add x10, x0, #119 +; NO_SVE-NEXT: ld1 { v7.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB43_236 +; NO_SVE-NEXT: .LBB43_251: // %cond.load358 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB43_237 +; NO_SVE-NEXT: .LBB43_252: // %cond.load361 +; NO_SVE-NEXT: add x10, x0, #121 +; NO_SVE-NEXT: ld1 { v7.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB43_238 +; NO_SVE-NEXT: .LBB43_253: // %cond.load364 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB43_239 +; NO_SVE-NEXT: .LBB43_254: // %cond.load367 +; NO_SVE-NEXT: add x10, x0, #123 +; NO_SVE-NEXT: ld1 { v7.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB43_240 +; NO_SVE-NEXT: .LBB43_255: // %cond.load370 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB43_241 +; NO_SVE-NEXT: .LBB43_256: // %cond.load373 +; NO_SVE-NEXT: add x10, x0, #125 +; NO_SVE-NEXT: ld1 { v7.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB43_242 +; NO_SVE-NEXT: .LBB43_257: // %cond.load376 +; NO_SVE-NEXT: add x10, x0, #126 +; NO_SVE-NEXT: ld1 { v7.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB43_243 +; NO_SVE-NEXT: b .LBB43_244 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -759,6 +11679,625 @@ } define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v64i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: 
umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[1] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v0.b[0] +; NO_SVE-NEXT: 
orr w11, w11, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: bfi w12, w13, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w12, w15, #2, #1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: bfi w12, w15, #5, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w12, w12, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v0.b[10] +; NO_SVE-NEXT: orr w11, w11, w17, lsl #13 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #8 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #14 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[13] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB44_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB44_3 +; NO_SVE-NEXT: b .LBB44_4 +; NO_SVE-NEXT: .LBB44_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB44_4 +; NO_SVE-NEXT: .LBB44_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB44_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB44_21 +; NO_SVE-NEXT: .LBB44_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB44_22 +; NO_SVE-NEXT: .LBB44_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB44_23 +; NO_SVE-NEXT: .LBB44_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB44_24 +; NO_SVE-NEXT: .LBB44_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB44_25 +; NO_SVE-NEXT: .LBB44_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB44_26 +; NO_SVE-NEXT: .LBB44_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB44_27 +; NO_SVE-NEXT: .LBB44_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB44_28 +; NO_SVE-NEXT: .LBB44_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB44_29 +; NO_SVE-NEXT: .LBB44_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB44_30 +; NO_SVE-NEXT: .LBB44_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB44_31 +; NO_SVE-NEXT: .LBB44_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB44_32 +; NO_SVE-NEXT: .LBB44_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB44_33 +; NO_SVE-NEXT: .LBB44_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB44_34 +; NO_SVE-NEXT: .LBB44_19: 
// %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB44_35 +; NO_SVE-NEXT: b .LBB44_36 +; NO_SVE-NEXT: .LBB44_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB44_6 +; NO_SVE-NEXT: .LBB44_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB44_7 +; NO_SVE-NEXT: .LBB44_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB44_8 +; NO_SVE-NEXT: .LBB44_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB44_9 +; NO_SVE-NEXT: .LBB44_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB44_10 +; NO_SVE-NEXT: .LBB44_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB44_11 +; NO_SVE-NEXT: .LBB44_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB44_12 +; NO_SVE-NEXT: .LBB44_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB44_13 +; NO_SVE-NEXT: .LBB44_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB44_14 +; NO_SVE-NEXT: .LBB44_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB44_15 +; NO_SVE-NEXT: .LBB44_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB44_16 +; NO_SVE-NEXT: .LBB44_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB44_17 +; NO_SVE-NEXT: .LBB44_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB44_18 +; NO_SVE-NEXT: .LBB44_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB44_19 +; NO_SVE-NEXT: .LBB44_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB44_36 +; NO_SVE-NEXT: .LBB44_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB44_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB44_53 +; NO_SVE-NEXT: .LBB44_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB44_54 +; NO_SVE-NEXT: .LBB44_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB44_55 +; NO_SVE-NEXT: .LBB44_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB44_56 +; NO_SVE-NEXT: .LBB44_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB44_57 +; NO_SVE-NEXT: .LBB44_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB44_58 +; NO_SVE-NEXT: .LBB44_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB44_59 +; NO_SVE-NEXT: .LBB44_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB44_60 +; NO_SVE-NEXT: .LBB44_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB44_61 +; NO_SVE-NEXT: .LBB44_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB44_62 +; NO_SVE-NEXT: .LBB44_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB44_63 +; NO_SVE-NEXT: .LBB44_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB44_64 +; NO_SVE-NEXT: 
.LBB44_49: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB44_65 +; NO_SVE-NEXT: .LBB44_50: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB44_66 +; NO_SVE-NEXT: .LBB44_51: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB44_67 +; NO_SVE-NEXT: b .LBB44_68 +; NO_SVE-NEXT: .LBB44_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB44_38 +; NO_SVE-NEXT: .LBB44_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB44_39 +; NO_SVE-NEXT: .LBB44_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB44_40 +; NO_SVE-NEXT: .LBB44_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB44_41 +; NO_SVE-NEXT: .LBB44_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB44_42 +; NO_SVE-NEXT: .LBB44_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB44_43 +; NO_SVE-NEXT: .LBB44_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB44_44 +; NO_SVE-NEXT: .LBB44_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB44_45 +; NO_SVE-NEXT: .LBB44_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB44_46 +; NO_SVE-NEXT: .LBB44_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB44_47 +; NO_SVE-NEXT: .LBB44_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB44_48 +; NO_SVE-NEXT: .LBB44_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB44_49 +; NO_SVE-NEXT: .LBB44_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB44_50 +; NO_SVE-NEXT: .LBB44_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB44_51 +; NO_SVE-NEXT: .LBB44_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x9, #33, .LBB44_68 +; NO_SVE-NEXT: .LBB44_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB44_84 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB44_85 +; NO_SVE-NEXT: .LBB44_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB44_86 +; NO_SVE-NEXT: .LBB44_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB44_87 +; NO_SVE-NEXT: .LBB44_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB44_88 +; NO_SVE-NEXT: .LBB44_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB44_89 +; NO_SVE-NEXT: .LBB44_74: // %else116 +; NO_SVE-NEXT: tbnz x9, #40, .LBB44_90 +; NO_SVE-NEXT: .LBB44_75: // %else119 +; NO_SVE-NEXT: tbnz x9, #41, .LBB44_91 +; NO_SVE-NEXT: .LBB44_76: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB44_92 +; NO_SVE-NEXT: .LBB44_77: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB44_93 +; NO_SVE-NEXT: .LBB44_78: // %else128 +; NO_SVE-NEXT: tbnz 
x9, #44, .LBB44_94 +; NO_SVE-NEXT: .LBB44_79: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB44_95 +; NO_SVE-NEXT: .LBB44_80: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB44_96 +; NO_SVE-NEXT: .LBB44_81: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB44_97 +; NO_SVE-NEXT: .LBB44_82: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB44_98 +; NO_SVE-NEXT: .LBB44_83: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB44_99 +; NO_SVE-NEXT: b .LBB44_100 +; NO_SVE-NEXT: .LBB44_84: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB44_70 +; NO_SVE-NEXT: .LBB44_85: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB44_71 +; NO_SVE-NEXT: .LBB44_86: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB44_72 +; NO_SVE-NEXT: .LBB44_87: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB44_73 +; NO_SVE-NEXT: .LBB44_88: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB44_74 +; NO_SVE-NEXT: .LBB44_89: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB44_75 +; NO_SVE-NEXT: .LBB44_90: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB44_76 +; NO_SVE-NEXT: .LBB44_91: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB44_77 +; NO_SVE-NEXT: .LBB44_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB44_78 +; NO_SVE-NEXT: .LBB44_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB44_79 +; NO_SVE-NEXT: .LBB44_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB44_80 +; NO_SVE-NEXT: .LBB44_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB44_81 +; NO_SVE-NEXT: .LBB44_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB44_82 +; NO_SVE-NEXT: .LBB44_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB44_83 +; NO_SVE-NEXT: .LBB44_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz x9, #49, .LBB44_100 +; NO_SVE-NEXT: .LBB44_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB44_116 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB44_117 +; NO_SVE-NEXT: .LBB44_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB44_118 +; NO_SVE-NEXT: .LBB44_103: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB44_119 +; NO_SVE-NEXT: .LBB44_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB44_120 +; NO_SVE-NEXT: .LBB44_105: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB44_121 +; NO_SVE-NEXT: .LBB44_106: // %else164 +; NO_SVE-NEXT: tbnz x9, #56, .LBB44_122 +; NO_SVE-NEXT: .LBB44_107: // %else167 +; NO_SVE-NEXT: tbnz x9, #57, 
.LBB44_123 +; NO_SVE-NEXT: .LBB44_108: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB44_124 +; NO_SVE-NEXT: .LBB44_109: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB44_125 +; NO_SVE-NEXT: .LBB44_110: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB44_126 +; NO_SVE-NEXT: .LBB44_111: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB44_127 +; NO_SVE-NEXT: .LBB44_112: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB44_128 +; NO_SVE-NEXT: .LBB44_113: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB44_115 +; NO_SVE-NEXT: .LBB44_114: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x9] +; NO_SVE-NEXT: .LBB44_115: // %else188 +; NO_SVE-NEXT: ushll v6.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll2 v5.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v7.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: ushll v6.4s, v6.4h, #0 +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: ushll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: ushll v5.4s, v5.4h, #0 +; NO_SVE-NEXT: ushll2 v4.8h, v3.16b, #0 +; NO_SVE-NEXT: ushll2 v6.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v4.8h, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: ushll v0.4s, v4.4h, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: ushll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: ushll v2.8h, v3.8b, #0 +; NO_SVE-NEXT: ushll2 v17.4s, v7.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll v7.4s, v7.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB44_116: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB44_102 +; NO_SVE-NEXT: .LBB44_117: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB44_103 +; NO_SVE-NEXT: .LBB44_118: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB44_104 +; NO_SVE-NEXT: .LBB44_119: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB44_105 +; NO_SVE-NEXT: .LBB44_120: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB44_106 +; NO_SVE-NEXT: .LBB44_121: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB44_107 +; NO_SVE-NEXT: .LBB44_122: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB44_108 +; NO_SVE-NEXT: .LBB44_123: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB44_109 +; NO_SVE-NEXT: .LBB44_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB44_110 +; NO_SVE-NEXT: .LBB44_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB44_111 +; NO_SVE-NEXT: 
.LBB44_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB44_112 +; NO_SVE-NEXT: .LBB44_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB44_113 +; NO_SVE-NEXT: .LBB44_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB44_114 +; NO_SVE-NEXT: b .LBB44_115 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -775,6 +12314,338 @@ } define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: umov w9, v0.b[8] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w11, v0.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[12] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w16, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: bfi w16, w10, #4, #1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: umov w10, v0.b[14] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w12, w16, w12, lsl #6 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w11, w12, w11, lsl #7 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[11] +; NO_SVE-NEXT: 
orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: orr w10, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[12] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[14] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v0.b[15] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB45_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB45_3 +; NO_SVE-NEXT: b .LBB45_4 +; NO_SVE-NEXT: .LBB45_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB45_4 +; NO_SVE-NEXT: .LBB45_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB45_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB45_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB45_21 +; NO_SVE-NEXT: .LBB45_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB45_22 +; NO_SVE-NEXT: .LBB45_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB45_23 +; NO_SVE-NEXT: .LBB45_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB45_24 +; NO_SVE-NEXT: .LBB45_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB45_25 +; NO_SVE-NEXT: .LBB45_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB45_26 +; NO_SVE-NEXT: .LBB45_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB45_27 +; NO_SVE-NEXT: .LBB45_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB45_28 +; NO_SVE-NEXT: .LBB45_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB45_29 +; NO_SVE-NEXT: .LBB45_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB45_30 +; NO_SVE-NEXT: .LBB45_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB45_31 +; NO_SVE-NEXT: .LBB45_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB45_32 +; NO_SVE-NEXT: .LBB45_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB45_33 +; NO_SVE-NEXT: .LBB45_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB45_34 +; NO_SVE-NEXT: .LBB45_19: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB45_35 +; NO_SVE-NEXT: b .LBB45_36 +; NO_SVE-NEXT: .LBB45_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB45_6 +; NO_SVE-NEXT: .LBB45_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB45_7 +; NO_SVE-NEXT: .LBB45_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB45_8 +; NO_SVE-NEXT: .LBB45_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB45_9 +; NO_SVE-NEXT: .LBB45_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB45_10 +; NO_SVE-NEXT: .LBB45_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB45_11 +; NO_SVE-NEXT: .LBB45_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, 
#9, .LBB45_12 +; NO_SVE-NEXT: .LBB45_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB45_13 +; NO_SVE-NEXT: .LBB45_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB45_14 +; NO_SVE-NEXT: .LBB45_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB45_15 +; NO_SVE-NEXT: .LBB45_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB45_16 +; NO_SVE-NEXT: .LBB45_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB45_17 +; NO_SVE-NEXT: .LBB45_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB45_18 +; NO_SVE-NEXT: .LBB45_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB45_19 +; NO_SVE-NEXT: .LBB45_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB45_36 +; NO_SVE-NEXT: .LBB45_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB45_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB45_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB45_53 +; NO_SVE-NEXT: .LBB45_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB45_54 +; NO_SVE-NEXT: .LBB45_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB45_55 +; NO_SVE-NEXT: .LBB45_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB45_56 +; NO_SVE-NEXT: .LBB45_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB45_57 +; NO_SVE-NEXT: .LBB45_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB45_58 +; NO_SVE-NEXT: .LBB45_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB45_59 +; NO_SVE-NEXT: .LBB45_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB45_60 +; NO_SVE-NEXT: .LBB45_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB45_61 +; NO_SVE-NEXT: .LBB45_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB45_62 +; NO_SVE-NEXT: .LBB45_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB45_63 +; NO_SVE-NEXT: .LBB45_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB45_64 +; NO_SVE-NEXT: .LBB45_49: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB45_51 +; NO_SVE-NEXT: .LBB45_50: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: .LBB45_51: // %else92 +; NO_SVE-NEXT: ushll v3.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v1.8b, #0 +; NO_SVE-NEXT: ushll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll2 v1.8h, v1.16b, #0 +; NO_SVE-NEXT: ushll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v5.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #96] +; NO_SVE-NEXT: ushll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: ushll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8, #64] +; NO_SVE-NEXT: ushll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll2 v4.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #32] +; NO_SVE-NEXT: ushll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #192] +; NO_SVE-NEXT: ushll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v3.4s, v3.4h, #0 +; 
NO_SVE-NEXT: stp q0, q1, [x8, #160] +; NO_SVE-NEXT: ushll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: ushll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: ushll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #224] +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB45_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB45_38 +; NO_SVE-NEXT: .LBB45_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB45_39 +; NO_SVE-NEXT: .LBB45_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB45_40 +; NO_SVE-NEXT: .LBB45_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB45_41 +; NO_SVE-NEXT: .LBB45_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB45_42 +; NO_SVE-NEXT: .LBB45_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB45_43 +; NO_SVE-NEXT: .LBB45_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB45_44 +; NO_SVE-NEXT: .LBB45_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB45_45 +; NO_SVE-NEXT: .LBB45_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB45_46 +; NO_SVE-NEXT: .LBB45_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB45_47 +; NO_SVE-NEXT: .LBB45_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB45_48 +; NO_SVE-NEXT: .LBB45_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB45_49 +; NO_SVE-NEXT: .LBB45_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB45_50 +; NO_SVE-NEXT: b .LBB45_51 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -791,6 +12662,635 @@ } define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v64i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q2, [x1, #96] +; NO_SVE-NEXT: cmeq v3.8h, v3.8h, #0 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w9, v3.b[1] +; NO_SVE-NEXT: umov w11, v3.b[2] +; NO_SVE-NEXT: umov w10, v3.b[0] +; NO_SVE-NEXT: umov w12, v3.b[3] +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: xtn v6.8b, v2.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v3.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; 
NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: umov w17, v6.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v6.b[2] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: umov w11, v6.b[3] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: umov w12, v6.b[4] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: cmeq v2.8h, v5.8h, #0 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: umov w13, v6.b[5] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v2.8h +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: umov w14, v6.b[6] +; NO_SVE-NEXT: umov w15, v5.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w16, v6.b[7] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v5.b[0] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: umov w13, v5.b[2] +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[4] +; NO_SVE-NEXT: bfi w10, w11, #1, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: cmeq v2.8h, v4.8h, #0 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[6] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v5.b[7] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[1] +; NO_SVE-NEXT: ldp q7, q3, [x1, #32] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #6 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v2.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: cmeq v4.8h, v7.8h, #0 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #9 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[1] +; NO_SVE-NEXT: umov w15, v4.b[0] +; NO_SVE-NEXT: umov w17, v4.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #13 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[3] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[4] +; NO_SVE-NEXT: bfi w12, w11, #1, #1 +; NO_SVE-NEXT: umov w11, v4.b[5] +; NO_SVE-NEXT: cmeq v3.8h, v3.8h, #0 +; NO_SVE-NEXT: umov w17, v4.b[7] +; NO_SVE-NEXT: bfi w12, w13, #2, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; 
NO_SVE-NEXT: umov w15, v4.b[6] +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: umov w13, v3.b[0] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: bfi w12, w11, #5, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w11, w12, w11, lsl #6 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #7 +; NO_SVE-NEXT: umov w14, v3.b[3] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: umov w16, v1.b[1] +; NO_SVE-NEXT: umov w17, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: umov w18, v1.b[4] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #11 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[3] +; NO_SVE-NEXT: umov w1, v1.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w15, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w15, w18, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: bfi w12, w16, #2, #1 +; NO_SVE-NEXT: and w16, w1, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #13 +; NO_SVE-NEXT: bfi w12, w14, #3, #1 +; NO_SVE-NEXT: umov w14, v1.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w12, w15, #4, #1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: bfi w12, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[1] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #6 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #7 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[3] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #14 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #8 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[6] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v3.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB46_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB46_3 +; NO_SVE-NEXT: b .LBB46_4 +; NO_SVE-NEXT: .LBB46_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; 
NO_SVE-NEXT: tbz w9, #1, .LBB46_4 +; NO_SVE-NEXT: .LBB46_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB46_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB46_13 +; NO_SVE-NEXT: .LBB46_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB46_14 +; NO_SVE-NEXT: .LBB46_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB46_15 +; NO_SVE-NEXT: .LBB46_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB46_16 +; NO_SVE-NEXT: .LBB46_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB46_17 +; NO_SVE-NEXT: .LBB46_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB46_18 +; NO_SVE-NEXT: .LBB46_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB46_19 +; NO_SVE-NEXT: b .LBB46_20 +; NO_SVE-NEXT: .LBB46_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB46_6 +; NO_SVE-NEXT: .LBB46_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB46_7 +; NO_SVE-NEXT: .LBB46_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB46_8 +; NO_SVE-NEXT: .LBB46_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB46_9 +; NO_SVE-NEXT: .LBB46_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB46_10 +; NO_SVE-NEXT: .LBB46_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB46_11 +; NO_SVE-NEXT: .LBB46_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB46_20 +; NO_SVE-NEXT: .LBB46_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB46_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB46_29 +; NO_SVE-NEXT: .LBB46_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB46_30 +; NO_SVE-NEXT: .LBB46_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB46_31 +; NO_SVE-NEXT: .LBB46_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB46_32 +; NO_SVE-NEXT: .LBB46_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB46_33 +; NO_SVE-NEXT: .LBB46_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB46_34 +; NO_SVE-NEXT: .LBB46_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB46_35 +; NO_SVE-NEXT: b .LBB46_36 +; NO_SVE-NEXT: .LBB46_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB46_22 +; NO_SVE-NEXT: .LBB46_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB46_23 +; NO_SVE-NEXT: .LBB46_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB46_24 +; NO_SVE-NEXT: .LBB46_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB46_25 +; NO_SVE-NEXT: .LBB46_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB46_26 +; NO_SVE-NEXT: .LBB46_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, 
#30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB46_27 +; NO_SVE-NEXT: .LBB46_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB46_36 +; NO_SVE-NEXT: .LBB46_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB46_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB46_45 +; NO_SVE-NEXT: .LBB46_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB46_46 +; NO_SVE-NEXT: .LBB46_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB46_47 +; NO_SVE-NEXT: .LBB46_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB46_48 +; NO_SVE-NEXT: .LBB46_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB46_49 +; NO_SVE-NEXT: .LBB46_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB46_50 +; NO_SVE-NEXT: .LBB46_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB46_51 +; NO_SVE-NEXT: b .LBB46_52 +; NO_SVE-NEXT: .LBB46_44: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB46_38 +; NO_SVE-NEXT: .LBB46_45: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB46_39 +; NO_SVE-NEXT: .LBB46_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB46_40 +; NO_SVE-NEXT: .LBB46_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB46_41 +; NO_SVE-NEXT: .LBB46_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB46_42 +; NO_SVE-NEXT: .LBB46_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB46_43 +; NO_SVE-NEXT: .LBB46_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB46_52 +; NO_SVE-NEXT: .LBB46_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB46_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB46_61 +; NO_SVE-NEXT: .LBB46_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB46_62 +; NO_SVE-NEXT: .LBB46_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB46_63 +; NO_SVE-NEXT: .LBB46_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB46_64 +; NO_SVE-NEXT: .LBB46_57: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB46_65 +; NO_SVE-NEXT: .LBB46_58: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB46_66 +; NO_SVE-NEXT: .LBB46_59: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB46_67 +; NO_SVE-NEXT: b .LBB46_68 +; NO_SVE-NEXT: .LBB46_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB46_54 +; NO_SVE-NEXT: .LBB46_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB46_55 +; NO_SVE-NEXT: .LBB46_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB46_56 +; NO_SVE-NEXT: .LBB46_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB46_57 +; NO_SVE-NEXT: .LBB46_64: // 
%cond.load88 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB46_58 +; NO_SVE-NEXT: .LBB46_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB46_59 +; NO_SVE-NEXT: .LBB46_66: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz x9, #33, .LBB46_68 +; NO_SVE-NEXT: .LBB46_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB46_76 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB46_77 +; NO_SVE-NEXT: .LBB46_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB46_78 +; NO_SVE-NEXT: .LBB46_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB46_79 +; NO_SVE-NEXT: .LBB46_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB46_80 +; NO_SVE-NEXT: .LBB46_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB46_81 +; NO_SVE-NEXT: .LBB46_74: // %else116 +; NO_SVE-NEXT: tbz x9, #40, .LBB46_82 +; NO_SVE-NEXT: .LBB46_75: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #41, .LBB46_83 +; NO_SVE-NEXT: b .LBB46_84 +; NO_SVE-NEXT: .LBB46_76: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB46_70 +; NO_SVE-NEXT: .LBB46_77: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB46_71 +; NO_SVE-NEXT: .LBB46_78: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB46_72 +; NO_SVE-NEXT: .LBB46_79: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB46_73 +; NO_SVE-NEXT: .LBB46_80: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB46_74 +; NO_SVE-NEXT: .LBB46_81: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #78 +; NO_SVE-NEXT: ld1 { v4.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #40, .LBB46_75 +; NO_SVE-NEXT: .LBB46_82: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz x9, #41, .LBB46_84 +; NO_SVE-NEXT: .LBB46_83: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_84: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB46_92 +; NO_SVE-NEXT: // %bb.85: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB46_93 +; NO_SVE-NEXT: .LBB46_86: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB46_94 +; NO_SVE-NEXT: .LBB46_87: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB46_95 +; NO_SVE-NEXT: .LBB46_88: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB46_96 +; NO_SVE-NEXT: .LBB46_89: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB46_97 +; NO_SVE-NEXT: .LBB46_90: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB46_98 +; NO_SVE-NEXT: .LBB46_91: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB46_99 +; NO_SVE-NEXT: b .LBB46_100 +; NO_SVE-NEXT: .LBB46_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB46_86 +; NO_SVE-NEXT: .LBB46_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB46_87 +; NO_SVE-NEXT: .LBB46_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.h }[4], [x10] +; 
NO_SVE-NEXT: tbz x9, #45, .LBB46_88 +; NO_SVE-NEXT: .LBB46_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB46_89 +; NO_SVE-NEXT: .LBB46_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB46_90 +; NO_SVE-NEXT: .LBB46_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB46_91 +; NO_SVE-NEXT: .LBB46_98: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #49, .LBB46_100 +; NO_SVE-NEXT: .LBB46_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB46_108 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB46_109 +; NO_SVE-NEXT: .LBB46_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB46_110 +; NO_SVE-NEXT: .LBB46_103: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB46_111 +; NO_SVE-NEXT: .LBB46_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB46_112 +; NO_SVE-NEXT: .LBB46_105: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB46_113 +; NO_SVE-NEXT: .LBB46_106: // %else164 +; NO_SVE-NEXT: tbz x9, #56, .LBB46_114 +; NO_SVE-NEXT: .LBB46_107: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #57, .LBB46_115 +; NO_SVE-NEXT: b .LBB46_116 +; NO_SVE-NEXT: .LBB46_108: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB46_102 +; NO_SVE-NEXT: .LBB46_109: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB46_103 +; NO_SVE-NEXT: .LBB46_110: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB46_104 +; NO_SVE-NEXT: .LBB46_111: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB46_105 +; NO_SVE-NEXT: .LBB46_112: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB46_106 +; NO_SVE-NEXT: .LBB46_113: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #56, .LBB46_107 +; NO_SVE-NEXT: .LBB46_114: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #57, .LBB46_116 +; NO_SVE-NEXT: .LBB46_115: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_116: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB46_124 +; NO_SVE-NEXT: // %bb.117: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB46_125 +; NO_SVE-NEXT: .LBB46_118: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB46_126 +; NO_SVE-NEXT: .LBB46_119: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB46_127 +; NO_SVE-NEXT: .LBB46_120: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB46_128 +; NO_SVE-NEXT: .LBB46_121: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB46_123 +; NO_SVE-NEXT: .LBB46_122: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #126 +; NO_SVE-NEXT: ld1 { v7.h }[7], [x9] +; NO_SVE-NEXT: .LBB46_123: // %else188 +; NO_SVE-NEXT: ushll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: ushll2 v16.4s, v1.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q16, 
[x8, #32] +; NO_SVE-NEXT: ushll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: ushll2 v0.4s, v3.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: ushll2 v0.4s, v4.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v4.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v5.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: ushll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v6.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: ushll2 v0.4s, v7.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v7.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB46_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB46_118 +; NO_SVE-NEXT: .LBB46_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB46_119 +; NO_SVE-NEXT: .LBB46_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB46_120 +; NO_SVE-NEXT: .LBB46_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB46_121 +; NO_SVE-NEXT: .LBB46_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.h }[6], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB46_122 +; NO_SVE-NEXT: b .LBB46_123 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -807,6 +13307,343 @@ } define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w17, v1.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: umov w17, v2.b[0] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #9 +; NO_SVE-NEXT: and w15, w15, #0x1 +; 
NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v2.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #12 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: bfi w16, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w16, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[6] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: orr w12, w16, w12, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w12, w10, lsl #7 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB47_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB47_3 +; NO_SVE-NEXT: b .LBB47_4 +; NO_SVE-NEXT: .LBB47_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB47_4 +; NO_SVE-NEXT: .LBB47_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB47_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB47_13 +; NO_SVE-NEXT: .LBB47_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB47_14 +; NO_SVE-NEXT: .LBB47_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB47_15 +; NO_SVE-NEXT: .LBB47_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB47_16 +; NO_SVE-NEXT: .LBB47_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB47_17 +; NO_SVE-NEXT: .LBB47_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB47_18 +; NO_SVE-NEXT: .LBB47_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB47_19 +; NO_SVE-NEXT: b .LBB47_20 +; NO_SVE-NEXT: .LBB47_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB47_6 +; NO_SVE-NEXT: .LBB47_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB47_7 +; NO_SVE-NEXT: .LBB47_14: // %cond.load10 +; NO_SVE-NEXT: add 
x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB47_8 +; NO_SVE-NEXT: .LBB47_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB47_9 +; NO_SVE-NEXT: .LBB47_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB47_10 +; NO_SVE-NEXT: .LBB47_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB47_11 +; NO_SVE-NEXT: .LBB47_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB47_20 +; NO_SVE-NEXT: .LBB47_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB47_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB47_29 +; NO_SVE-NEXT: .LBB47_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB47_30 +; NO_SVE-NEXT: .LBB47_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB47_31 +; NO_SVE-NEXT: .LBB47_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB47_32 +; NO_SVE-NEXT: .LBB47_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB47_33 +; NO_SVE-NEXT: .LBB47_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB47_34 +; NO_SVE-NEXT: .LBB47_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB47_35 +; NO_SVE-NEXT: b .LBB47_36 +; NO_SVE-NEXT: .LBB47_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB47_22 +; NO_SVE-NEXT: .LBB47_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB47_23 +; NO_SVE-NEXT: .LBB47_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB47_24 +; NO_SVE-NEXT: .LBB47_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB47_25 +; NO_SVE-NEXT: .LBB47_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB47_26 +; NO_SVE-NEXT: .LBB47_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB47_27 +; NO_SVE-NEXT: .LBB47_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB47_36 +; NO_SVE-NEXT: .LBB47_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB47_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB47_45 +; NO_SVE-NEXT: .LBB47_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB47_46 +; NO_SVE-NEXT: .LBB47_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB47_47 +; NO_SVE-NEXT: .LBB47_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB47_48 +; NO_SVE-NEXT: .LBB47_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB47_49 +; NO_SVE-NEXT: .LBB47_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB47_50 +; NO_SVE-NEXT: .LBB47_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB47_51 +; NO_SVE-NEXT: b .LBB47_52 +; NO_SVE-NEXT: .LBB47_44: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB47_38 +; NO_SVE-NEXT: .LBB47_45: // 
%cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB47_39 +; NO_SVE-NEXT: .LBB47_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB47_40 +; NO_SVE-NEXT: .LBB47_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB47_41 +; NO_SVE-NEXT: .LBB47_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB47_42 +; NO_SVE-NEXT: .LBB47_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB47_43 +; NO_SVE-NEXT: .LBB47_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB47_52 +; NO_SVE-NEXT: .LBB47_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB47_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB47_61 +; NO_SVE-NEXT: .LBB47_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB47_62 +; NO_SVE-NEXT: .LBB47_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB47_63 +; NO_SVE-NEXT: .LBB47_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB47_64 +; NO_SVE-NEXT: .LBB47_57: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB47_59 +; NO_SVE-NEXT: .LBB47_58: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: .LBB47_59: // %else92 +; NO_SVE-NEXT: ushll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll2 v5.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: ushll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll2 v4.4s, v3.8h, #0 +; NO_SVE-NEXT: ushll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: ushll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v3.4h, #0 +; NO_SVE-NEXT: ushll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB47_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB47_54 +; NO_SVE-NEXT: .LBB47_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB47_55 +; NO_SVE-NEXT: .LBB47_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB47_56 +; NO_SVE-NEXT: .LBB47_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB47_57 +; NO_SVE-NEXT: .LBB47_64: // %cond.load88 +; 
NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB47_58 +; NO_SVE-NEXT: b .LBB47_59 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -823,6 +13660,349 @@ } define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x1, #64] +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: cmeq v3.4s, v3.4s, #0 +; NO_SVE-NEXT: ldp q4, q5, [x1, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: cmeq v4.4s, v4.4s, #0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: cmeq v5.4s, v5.4s, #0 +; NO_SVE-NEXT: umov w9, v2.b[1] +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[5] +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: umov w11, v3.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: ldp q1, q4, [x1, #32] +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v2.4s, v4.4s, #0 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: bfi w10, w13, #1, #1 +; NO_SVE-NEXT: umov w16, v0.b[5] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[6] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[0] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, 
v0.b[1] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #6 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v3.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB48_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB48_3 +; NO_SVE-NEXT: b .LBB48_4 +; NO_SVE-NEXT: .LBB48_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB48_4 +; NO_SVE-NEXT: .LBB48_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB48_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB48_9 +; NO_SVE-NEXT: .LBB48_6: // %else8 +; NO_SVE-NEXT: tbz w9, #4, .LBB48_10 +; NO_SVE-NEXT: .LBB48_7: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #5, .LBB48_11 +; NO_SVE-NEXT: b .LBB48_12 +; NO_SVE-NEXT: .LBB48_8: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB48_6 +; NO_SVE-NEXT: .LBB48_9: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #4, .LBB48_7 +; NO_SVE-NEXT: .LBB48_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #5, .LBB48_12 +; NO_SVE-NEXT: .LBB48_11: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_12: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB48_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB48_17 +; NO_SVE-NEXT: .LBB48_14: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB48_18 +; NO_SVE-NEXT: .LBB48_15: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB48_19 +; NO_SVE-NEXT: b .LBB48_20 +; NO_SVE-NEXT: .LBB48_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB48_14 +; NO_SVE-NEXT: .LBB48_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB48_15 +; NO_SVE-NEXT: .LBB48_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #9, .LBB48_20 +; NO_SVE-NEXT: .LBB48_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB48_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB48_25 +; NO_SVE-NEXT: .LBB48_22: // %else32 +; NO_SVE-NEXT: tbz w9, #12, .LBB48_26 +; 
NO_SVE-NEXT: .LBB48_23: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #13, .LBB48_27 +; NO_SVE-NEXT: b .LBB48_28 +; NO_SVE-NEXT: .LBB48_24: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB48_22 +; NO_SVE-NEXT: .LBB48_25: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #12, .LBB48_23 +; NO_SVE-NEXT: .LBB48_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #13, .LBB48_28 +; NO_SVE-NEXT: .LBB48_27: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_28: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB48_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB48_33 +; NO_SVE-NEXT: .LBB48_30: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB48_34 +; NO_SVE-NEXT: .LBB48_31: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB48_35 +; NO_SVE-NEXT: b .LBB48_36 +; NO_SVE-NEXT: .LBB48_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB48_30 +; NO_SVE-NEXT: .LBB48_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB48_31 +; NO_SVE-NEXT: .LBB48_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #17, .LBB48_36 +; NO_SVE-NEXT: .LBB48_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB48_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB48_41 +; NO_SVE-NEXT: .LBB48_38: // %else56 +; NO_SVE-NEXT: tbz w9, #20, .LBB48_42 +; NO_SVE-NEXT: .LBB48_39: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #21, .LBB48_43 +; NO_SVE-NEXT: b .LBB48_44 +; NO_SVE-NEXT: .LBB48_40: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB48_38 +; NO_SVE-NEXT: .LBB48_41: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #20, .LBB48_39 +; NO_SVE-NEXT: .LBB48_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #21, .LBB48_44 +; NO_SVE-NEXT: .LBB48_43: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_44: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB48_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB48_49 +; NO_SVE-NEXT: .LBB48_46: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB48_50 +; NO_SVE-NEXT: .LBB48_47: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB48_51 +; NO_SVE-NEXT: b .LBB48_52 +; NO_SVE-NEXT: .LBB48_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB48_46 +; NO_SVE-NEXT: .LBB48_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB48_47 +; NO_SVE-NEXT: .LBB48_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w9, #25, .LBB48_52 +; NO_SVE-NEXT: .LBB48_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x10] +; NO_SVE-NEXT: 
.LBB48_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB48_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB48_57 +; NO_SVE-NEXT: .LBB48_54: // %else80 +; NO_SVE-NEXT: tbz w9, #28, .LBB48_58 +; NO_SVE-NEXT: .LBB48_55: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #29, .LBB48_59 +; NO_SVE-NEXT: b .LBB48_60 +; NO_SVE-NEXT: .LBB48_56: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB48_54 +; NO_SVE-NEXT: .LBB48_57: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #28, .LBB48_55 +; NO_SVE-NEXT: .LBB48_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w9, #29, .LBB48_60 +; NO_SVE-NEXT: .LBB48_59: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_60: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB48_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB48_63 +; NO_SVE-NEXT: .LBB48_62: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: .LBB48_63: // %else92 +; NO_SVE-NEXT: ushll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: ushll2 v16.2d, v1.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: ushll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: ushll2 v0.2d, v4.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v4.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v5.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: ushll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v6.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: ushll2 v0.2d, v7.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v7.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB48_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB48_62 +; NO_SVE-NEXT: b .LBB48_63 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -839,13 +14019,102 @@ } define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_ugt_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmtst v1.4s, v1.4s, v1.4s +; NO_SVE-NEXT: cmtst v0.4s, v0.4s, v0.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, 
v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB49_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB49_3 +; NO_SVE-NEXT: b .LBB49_4 +; NO_SVE-NEXT: .LBB49_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB49_4 +; NO_SVE-NEXT: .LBB49_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB49_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB49_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB49_9 +; NO_SVE-NEXT: .LBB49_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB49_10 +; NO_SVE-NEXT: .LBB49_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB49_11 +; NO_SVE-NEXT: b .LBB49_12 +; NO_SVE-NEXT: .LBB49_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB49_6 +; NO_SVE-NEXT: .LBB49_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB49_7 +; NO_SVE-NEXT: .LBB49_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB49_12 +; NO_SVE-NEXT: .LBB49_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB49_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB49_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB49_15 +; NO_SVE-NEXT: .LBB49_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB49_15: // %else20 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB49_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB49_14 +; NO_SVE-NEXT: b .LBB49_15 +; ; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64: -; VBITS_GE_512: // %bb.0 -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, <8 x i32>* %bp %mask = icmp ugt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) @@ -854,13 +14123,102 @@ } define <8 x i64> @masked_load_zext_sgt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_sgt_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmgt v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmgt v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, 
v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB50_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB50_3 +; NO_SVE-NEXT: b .LBB50_4 +; NO_SVE-NEXT: .LBB50_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB50_4 +; NO_SVE-NEXT: .LBB50_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB50_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB50_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB50_9 +; NO_SVE-NEXT: .LBB50_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB50_10 +; NO_SVE-NEXT: .LBB50_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB50_11 +; NO_SVE-NEXT: b .LBB50_12 +; NO_SVE-NEXT: .LBB50_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB50_6 +; NO_SVE-NEXT: .LBB50_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB50_7 +; NO_SVE-NEXT: .LBB50_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB50_12 +; NO_SVE-NEXT: .LBB50_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB50_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB50_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB50_15 +; NO_SVE-NEXT: .LBB50_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB50_15: // %else20 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB50_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB50_14 +; NO_SVE-NEXT: b .LBB50_15 +; ; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64: -; VBITS_GE_512: // %bb.0 -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, <8 x i32>* %bp %mask = icmp sgt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll 
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -268,10 +268,9 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm @@ -282,12 +281,11 @@ ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll @@ -23,8 +23,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1] +; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %c1 = fcmp olt float %x, 1.0 @@ -40,8 +40,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1] +; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %c1 = fcmp olt float %x, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -85,10 +85,10 @@ } ; GCN-LABEL: name: divergent_vec_i16_LL -; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16 -; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]], %1, implicit $exec ; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 %0, killed %[[IMM]], implicit $exec +; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16 +; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]], %1, implicit $exec ; GCN: V_OR_B32_e64 killed %[[AND]], killed %[[SHL]], implicit $exec ; GFX9-LABEL: name: divergent_vec_i16_LL diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -209,27 +209,27 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 -; 
ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:3 -; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:5 +; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 +; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 +; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 +; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align1: @@ -294,19 +294,19 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:6 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align2: @@ -395,22 +395,22 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 ; ALIGNED-SDAG-NEXT: ds_write_b8 
v12, v9 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(9) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(9) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align1: @@ -492,23 +492,23 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:8 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:8 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align2: @@ -693,30 +693,25 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; 
ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align1: @@ -814,27 +809,29 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align2: diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -657,20 +657,20 @@ ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: ds_write_b8 v0, v3 offset:8 ; CI-NEXT: ds_write_b8 v0, v4 offset:7 ; 
CI-NEXT: ds_write_b8 v0, v5 offset:6 -; CI-NEXT: ds_write_b8 v0, v1 offset:16 -; CI-NEXT: ds_write_b8 v0, v6 offset:15 -; CI-NEXT: ds_write_b8 v0, v2 offset:14 ; CI-NEXT: ds_write_b8 v0, v3 offset:12 ; CI-NEXT: ds_write_b8 v0, v4 offset:11 ; CI-NEXT: ds_write_b8 v0, v5 offset:10 +; CI-NEXT: ds_write_b8 v0, v1 offset:16 +; CI-NEXT: ds_write_b8 v0, v6 offset:15 +; CI-NEXT: ds_write_b8 v0, v2 offset:14 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: @@ -686,18 +686,18 @@ ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9 +; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -30,22 +30,22 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) { ; CHECK-LABEL: test_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 28, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 24, v1 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v4, v2 -; CHECK-NEXT: ds_read_b32 v3, v3 -; CHECK-NEXT: ds_read_b32 v2, v6 +; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: ds_read_b32 v5, v4 +; CHECK-NEXT: ds_read_b32 v4, v6 ; CHECK-NEXT: ds_read_b32 v9, v7 ; CHECK-NEXT: ds_read_b32 v8, v8 ; CHECK-NEXT: ds_read_b32 v7, v10 ; CHECK-NEXT: ds_read_b32 v6, v1 -; CHECK-NEXT: ds_read_b32 v5, v5 +; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/r600-export-fix.ll b/llvm/test/CodeGen/AMDGPU/r600-export-fix.ll --- a/llvm/test/CodeGen/AMDGPU/r600-export-fix.ll +++ b/llvm/test/CodeGen/AMDGPU/r600-export-fix.ll @@ -6,42 +6,42 @@ ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: CALL_FS ; CHECK-NEXT: ALU 24, @12, KC0[CB0:0-32], KC1[] -; CHECK-NEXT: EXPORT T0.XYZW +; CHECK-NEXT: EXPORT T2.XYZW ; CHECK-NEXT: EXPORT T0.0000 ; CHECK-NEXT: EXPORT T0.0000 -; CHECK-NEXT: 
EXPORT T4.0YZW -; CHECK-NEXT: EXPORT T3.XYZW -; CHECK-NEXT: EXPORT T2.XY00 +; CHECK-NEXT: EXPORT T3.0YZW +; CHECK-NEXT: EXPORT T0.WYZX +; CHECK-NEXT: EXPORT T1.XY00 ; CHECK-NEXT: EXPORT T0.0000 ; CHECK-NEXT: EXPORT T0.0000 ; CHECK-NEXT: CF_END ; CHECK-NEXT: PAD ; CHECK-NEXT: ALU clause starting at 12: ; CHECK-NEXT: MUL_IEEE * T0.W, KC0[4].X, T1.X, -; CHECK-NEXT: MULADD_IEEE T0.W, KC0[5].X, T1.Y, PV.W, -; CHECK-NEXT: MUL_IEEE * T2.W, KC0[4].Y, T1.X, +; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[5].X, T1.Y, PV.W, +; CHECK-NEXT: MOV T0.X, KC0[2].X, ; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[6].X, T1.Z, PV.W, -; CHECK-NEXT: MULADD_IEEE T0.X, KC0[7].X, T1.W, PV.W, -; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[5].Y, T1.Y, T2.W, -; CHECK-NEXT: MUL_IEEE * T2.W, KC0[4].Z, T1.X, -; CHECK-NEXT: MOV T2.Y, KC0[2].Z, -; CHECK-NEXT: MULADD_IEEE * T2.W, KC0[5].Z, T1.Y, PV.W, -; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[6].Y, T1.Z, T0.W, -; CHECK-NEXT: MOV T2.X, KC0[2].Y, -; CHECK-NEXT: MULADD_IEEE * T0.Y, KC0[7].Y, T1.W, PV.W, -; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[6].Z, T1.Z, T2.W, -; CHECK-NEXT: MULADD_IEEE T0.Z, KC0[7].Z, T1.W, PV.W, -; CHECK-NEXT: MUL_IEEE * T0.W, KC0[4].W, T1.X, -; CHECK-NEXT: MOV * T3.W, KC0[2].X, -; CHECK-NEXT: MOV T3.Z, KC0[3].Z, -; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[5].W, T1.Y, T0.W, -; CHECK-NEXT: MOV * T4.W, KC0[0].Z, -; CHECK-NEXT: MOV T3.Y, KC0[3].Y, -; CHECK-NEXT: MOV * T4.Z, KC0[0].Y, -; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[6].W, T1.Z, T0.W, -; CHECK-NEXT: MOV T3.X, KC0[3].X, -; CHECK-NEXT: MOV * T4.Y, KC0[0].X, -; CHECK-NEXT: MULADD_IEEE * T0.W, KC0[7].W, T1.W, T0.W, +; CHECK-NEXT: MULADD_IEEE T2.X, KC0[7].X, T1.W, PV.W, +; CHECK-NEXT: MUL_IEEE * T2.W, KC0[4].Y, T1.X, +; CHECK-NEXT: MOV * T0.W, KC0[3].X, +; CHECK-NEXT: MOV T0.Z, KC0[3].Z, +; CHECK-NEXT: MULADD_IEEE * T2.W, KC0[5].Y, T1.Y, T2.W, +; CHECK-NEXT: MUL_IEEE * T3.W, KC0[4].Z, T1.X, +; CHECK-NEXT: MOV T0.Y, KC0[3].Y, +; CHECK-NEXT: MULADD_IEEE * T3.W, KC0[5].Z, T1.Y, PV.W, +; CHECK-NEXT: MULADD_IEEE * T2.W, KC0[6].Y, T1.Z, T2.W, +; CHECK-NEXT: MULADD_IEEE T2.Y, KC0[7].Y, T1.W, PV.W, +; CHECK-NEXT: MUL_IEEE * T2.W, KC0[4].W, T1.X, +; CHECK-NEXT: MULADD_IEEE * T3.W, KC0[6].Z, T1.Z, T3.W, +; CHECK-NEXT: MULADD_IEEE T2.Z, KC0[7].Z, T1.W, PV.W, +; CHECK-NEXT: MULADD_IEEE * T2.W, KC0[5].W, T1.Y, T2.W, +; CHECK-NEXT: MOV * T3.W, KC0[0].Z, +; CHECK-NEXT: MOV T1.Y, KC0[2].Z, +; CHECK-NEXT: MOV * T3.Z, KC0[0].Y, +; CHECK-NEXT: MULADD_IEEE * T2.W, KC0[6].W, T1.Z, T2.W, +; CHECK-NEXT: MOV T1.X, KC0[2].Y, +; CHECK-NEXT: MOV * T3.Y, KC0[0].X, +; CHECK-NEXT: MULADD_IEEE * T2.W, KC0[7].W, T1.W, T2.W, main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -38,8 +38,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -85,8 +85,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -70,42 +70,42 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b8 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s6, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s7, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align1: @@ -115,50 +115,50 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s4, s2, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s4, s3, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s4, s2, 24 -; GFX7-NEXT: 
ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: s_lshr_b32 s4, s3, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 8 +; GFX7-NEXT: s_lshr_b32 s2, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align1: @@ -168,50 +168,50 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s4, s3, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_lshr_b32 s4, s2, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: s_lshr_b32 s4, s3, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 8 +; GFX6-NEXT: 
s_lshr_b32 s2, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 24 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align1: @@ -221,42 +221,42 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_lshr_b32 s3, s7, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s6, 8 -; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s6, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_lshr_b32 s2, s7, 8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: s_lshr_b32 s3, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: s_lshr_b32 s0, s7, 8 +; GFX10-NEXT: s_lshr_b32 s2, s6, 8 +; GFX10-NEXT: s_lshr_b32 s6, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_lshr_b32 s1, s7, 24 +; GFX10-NEXT: s_lshr_b32 s5, s5, 24 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 -; GFX10-NEXT: ds_write_b8 v0, v3 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v5 offset:9 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:13 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshr_b32 s1, s5, 24 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2 +; GFX10-NEXT: ds_write_b8 
v0, v5 offset:13 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_lshr_b32 s1, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:15 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:3 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:3 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -269,18 +269,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:12 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align2: @@ -290,26 +290,26 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 16 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align2: @@ -319,26 +319,26 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 16 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align2: @@ -348,18 +348,18 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 offset:12 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 -; GFX10-NEXT: ds_write_b16 v0, v3 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b16 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -68,32 +68,32 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: 
v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -104,12 +104,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 @@ -117,25 +117,25 @@ ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: s_lshr_b32 s2, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align1: @@ -146,12 +146,12 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b8 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 @@ -159,25 +159,25 @@ ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 +; GFX6-NEXT: s_lshr_b32 s2, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 
v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align1: @@ -188,32 +188,32 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s2, s4, 8 -; GFX10-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-NEXT: s_lshr_b32 s4, s5, 8 -; GFX10-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s3, s5, 24 +; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: s_lshr_b32 s4, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v9, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s4 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v3 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void @@ -227,14 +227,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 
offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: @@ -245,20 +245,20 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b16 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align2: @@ -269,20 +269,20 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b16 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align2: @@ -293,14 +293,14 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 -; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v3 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -45,19 +45,22 @@ ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x3 
+; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: s_and_b32 s3, s0, 0xffff +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s0 ; HAWAII-NEXT: v_mov_b32_e32 v3, s2 -; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 ; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; HAWAII-NEXT: v_or_b32_e32 v0, s3, v0 +; HAWAII-NEXT: v_bfe_u32 v0, v0, 16, 7 ; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: ds_write_b32 v1, v3 ; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: diff --git a/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll b/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll --- a/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll +++ b/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll @@ -14,9 +14,8 @@ ; ARM-NEXT: adds r0, r1, r0 ; ARM-NEXT: movw r1, #65535 ; ARM-NEXT: sxth r2, r2 -; ARM-NEXT: adc r0, r2, #0 -; ARM-NEXT: uxth r0, r0 -; ARM-NEXT: cmp r0, r1 +; ARM-NEXT: adc r0, r2, #1 +; ARM-NEXT: tst r0, r1 ; ARM-NEXT: bxeq lr ; ARM-NEXT: .LBB0_1: @ %for.cond ; ARM-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -26,33 +25,25 @@ ; THUMBV6M: @ %bb.0: @ %entry ; THUMBV6M-NEXT: rsbs r2, r2, #0 ; THUMBV6M-NEXT: sxth r2, r2 -; THUMBV6M-NEXT: movs r3, #0 +; THUMBV6M-NEXT: movs r3, #1 ; THUMBV6M-NEXT: adds r0, r1, r0 ; THUMBV6M-NEXT: adcs r3, r2 -; THUMBV6M-NEXT: uxth r0, r3 -; THUMBV6M-NEXT: ldr r1, .LCPI0_0 -; THUMBV6M-NEXT: cmp r0, r1 +; THUMBV6M-NEXT: lsls r0, r3, #16 ; THUMBV6M-NEXT: beq .LBB0_2 ; THUMBV6M-NEXT: .LBB0_1: @ %for.cond ; THUMBV6M-NEXT: @ =>This Inner Loop Header: Depth=1 ; THUMBV6M-NEXT: b .LBB0_1 ; THUMBV6M-NEXT: .LBB0_2: @ %if.end ; THUMBV6M-NEXT: bx lr -; THUMBV6M-NEXT: .p2align 2 -; THUMBV6M-NEXT: @ %bb.3: -; THUMBV6M-NEXT: .LCPI0_0: -; THUMBV6M-NEXT: .long 65535 @ 0xffff ; ; THUMBV8M-BASE-LABEL: fn1: ; THUMBV8M-BASE: @ %bb.0: @ %entry ; THUMBV8M-BASE-NEXT: rsbs r2, r2, #0 ; THUMBV8M-BASE-NEXT: sxth r2, r2 -; THUMBV8M-BASE-NEXT: movs r3, #0 +; THUMBV8M-BASE-NEXT: movs r3, #1 ; THUMBV8M-BASE-NEXT: adds r0, r1, r0 ; THUMBV8M-BASE-NEXT: adcs r3, r2 -; THUMBV8M-BASE-NEXT: uxth r0, r3 -; THUMBV8M-BASE-NEXT: movw r1, #65535 -; THUMBV8M-BASE-NEXT: cmp r0, r1 +; THUMBV8M-BASE-NEXT: lsls r0, r3, #16 ; THUMBV8M-BASE-NEXT: beq .LBB0_2 ; THUMBV8M-BASE-NEXT: .LBB0_1: @ %for.cond ; THUMBV8M-BASE-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -64,11 +55,9 @@ ; THUMB: @ %bb.0: @ %entry ; THUMB-NEXT: rsbs r2, r2, #0 ; THUMB-NEXT: adds r0, r0, r1 -; THUMB-NEXT: movw r1, #65535 ; THUMB-NEXT: sxth r2, r2 -; THUMB-NEXT: adc r0, r2, #0 -; THUMB-NEXT: uxth r0, r0 -; THUMB-NEXT: cmp r0, r1 +; THUMB-NEXT: adc r0, r2, #1 +; THUMB-NEXT: lsls r0, r0, #16 ; THUMB-NEXT: it eq ; THUMB-NEXT: bxeq lr ; THUMB-NEXT: .LBB0_1: @ %for.cond diff --git a/llvm/test/CodeGen/ARM/dsp-mlal.ll b/llvm/test/CodeGen/ARM/dsp-mlal.ll --- a/llvm/test/CodeGen/ARM/dsp-mlal.ll +++ b/llvm/test/CodeGen/ARM/dsp-mlal.ll @@ -6,28 +6,36 @@ define hidden i32 @SMMULR_SMMLAR(i32 %a, i32 %b0, i32 %b1, i32 %Xn, i32 %Xn1) local_unnamed_addr { ; DSP-LABEL: SMMULR_SMMLAR: ; DSP: @ %bb.0: @ %entry -; DSP-NEXT: ldr r0, [sp] -; DSP-NEXT: smmulr r0, r0, r2 -; DSP-NEXT: smmlar r0, r3, r1, r0 +; DSP-NEXT: smull r0, r1, r3, r1 +; DSP-NEXT: ldr r3, [sp] +; DSP-NEXT: smull r2, r3, r3, r2 +; 
DSP-NEXT: adds.w r2, r2, #-2147483648 +; DSP-NEXT: adcs r1, r3 +; DSP-NEXT: adds.w r0, r0, #-2147483648 +; DSP-NEXT: adc r0, r1, #0 ; DSP-NEXT: bx lr ; ; ARM7-LABEL: SMMULR_SMMLAR: ; ARM7: @ %bb.0: @ %entry ; ARM7-NEXT: ldr r0, [sp] -; ARM7-NEXT: smmulr r0, r0, r2 -; ARM7-NEXT: smmlar r0, r3, r1, r0 +; ARM7-NEXT: smull r1, r3, r3, r1 +; ARM7-NEXT: smull r0, r2, r0, r2 +; ARM7-NEXT: adds r0, r0, #-2147483648 +; ARM7-NEXT: adc r0, r3, r2 +; ARM7-NEXT: adds r1, r1, #-2147483648 +; ARM7-NEXT: adc r0, r0, #0 ; ARM7-NEXT: bx lr ; ; NODSP-LABEL: SMMULR_SMMLAR: ; NODSP: @ %bb.0: @ %entry -; NODSP-NEXT: push {r4, lr} -; NODSP-NEXT: ldr.w lr, [sp, #8] -; NODSP-NEXT: movs r0, #0 -; NODSP-NEXT: mov.w r4, #-2147483648 -; NODSP-NEXT: mov.w r12, #-2147483648 -; NODSP-NEXT: smlal r4, r0, lr, r2 -; NODSP-NEXT: smlal r12, r0, r3, r1 -; NODSP-NEXT: pop {r4, pc} +; NODSP-NEXT: smull r0, r1, r3, r1 +; NODSP-NEXT: ldr r3, [sp] +; NODSP-NEXT: smull r2, r3, r3, r2 +; NODSP-NEXT: adds.w r2, r2, #-2147483648 +; NODSP-NEXT: adcs r1, r3 +; NODSP-NEXT: adds.w r0, r0, #-2147483648 +; NODSP-NEXT: adc r0, r1, #0 +; NODSP-NEXT: bx lr entry: %conv = sext i32 %b1 to i64 %conv1 = sext i32 %Xn1 to i64 diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll --- a/llvm/test/CodeGen/ARM/neon-copy.ll +++ b/llvm/test/CodeGen/ARM/neon-copy.ll @@ -1278,8 +1278,24 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK-LABEL: getl: ; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov.u8 r0, d0[1] +; CHECK-NEXT: vmov.u8 r1, d0[2] +; CHECK-NEXT: vmov.u8 r2, d0[3] +; CHECK-NEXT: vmov.u8 r3, d0[4] +; CHECK-NEXT: vmov.u8 r12, d0[5] +; CHECK-NEXT: vmov.u8 lr, d0[6] +; CHECK-NEXT: vmov.u8 r4, d0[7] +; CHECK-NEXT: vmov.8 d0[1], r0 +; CHECK-NEXT: vmov.8 d0[2], r1 +; CHECK-NEXT: vmov.8 d0[3], r2 +; CHECK-NEXT: vmov.8 d0[4], r3 +; CHECK-NEXT: vmov.8 d0[5], r12 +; CHECK-NEXT: vmov.8 d0[6], lr +; CHECK-NEXT: vmov.8 d0[7], r4 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r4, pc} %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 %vecext1 = extractelement <16 x i8> %x, i32 1 @@ -1404,9 +1420,12 @@ define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) { ; CHECK-LABEL: test_dup_v1i64_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.32 r0, d0[0] -; CHECK-NEXT: vmov.16 d16[0], r0 -; CHECK-NEXT: vdup.16 d0, d16[0] +; CHECK-NEXT: vmov.32 r0, d0[1] +; CHECK-NEXT: vmov.32 r1, d0[0] +; CHECK-NEXT: vmov d0, r1, r0 +; CHECK-NEXT: vmov.16 d0[1], r1 +; CHECK-NEXT: vmov.16 d0[2], r1 +; CHECK-NEXT: vmov.16 d0[3], r1 ; CHECK-NEXT: bx lr entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1421,7 +1440,10 @@ define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) { ; CHECK-LABEL: test_dup_v1i64_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 d0, d0[0] +; CHECK-NEXT: vmov.32 r0, d0[1] +; CHECK-NEXT: vmov.32 r1, d0[0] +; CHECK-NEXT: vmov d0, r1, r0 +; CHECK-NEXT: vmov.32 d0[1], r1 ; CHECK-NEXT: bx lr entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1487,9 +1509,11 @@ define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.32 r0, d0[0] -; CHECK-NEXT: vmov.16 d16[0], r0 -; CHECK-NEXT: vdup.16 d0, d16[0] +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.16 d0[1], r0 +; CHECK-NEXT: vmov.16 d0[2], r0 +; CHECK-NEXT: vmov.16 d0[3], r0 +; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bx lr entry: %x 
= extractelement <2 x i64> %a, i32 0 @@ -1504,7 +1528,9 @@ define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 d0, d0[0] +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.32 d0[1], r0 +; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bx lr entry: %x = extractelement <2 x i64> %a, i32 0 @@ -1528,7 +1554,8 @@ define <2 x i32> @test_concat_same_v1i32_v1i32(<2 x i32> %a) { ; CHECK-LABEL: test_concat_same_v1i32_v1i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 d0, d0[0] +; CHECK-NEXT: vmov.32 r0, d0[0] +; CHECK-NEXT: vmov.32 d0[1], r0 ; CHECK-NEXT: bx lr entry: %0 = extractelement <2 x i32> %a, i32 0 @@ -1551,9 +1578,28 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: vmov.f64 d1, d2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, d0[0] +; CHECK-NEXT: vmov.u8 r1, d0[1] +; CHECK-NEXT: vmov.u8 r2, d0[2] +; CHECK-NEXT: vmov.u8 r3, d0[3] +; CHECK-NEXT: vmov.u8 r12, d0[4] +; CHECK-NEXT: vmov.u8 lr, d0[5] +; CHECK-NEXT: vmov.u8 r4, d0[6] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d0[7] +; CHECK-NEXT: vmov.8 d16[1], r1 +; CHECK-NEXT: vmov.8 d16[2], r2 +; CHECK-NEXT: vmov.8 d16[3], r3 +; CHECK-NEXT: vmov.8 d16[4], r12 +; CHECK-NEXT: vmov.8 d16[5], lr +; CHECK-NEXT: vmov.8 d16[6], r4 +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vorr d17, d2, d2 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: pop {r4, pc} entry: %vecext = extractelement <8 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1578,8 +1624,25 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: vmov.u8 r3, d2[0] +; CHECK-NEXT: vmov.u8 r0, d2[1] +; CHECK-NEXT: vmov.u8 r1, d2[2] +; CHECK-NEXT: vmov.u8 r4, d2[3] +; CHECK-NEXT: vmov.u8 r5, d2[4] +; CHECK-NEXT: vmov.u8 r2, d2[5] +; CHECK-NEXT: vmov.u8 lr, d2[6] +; CHECK-NEXT: vmov.u8 r12, d2[7] +; CHECK-NEXT: vmov.8 d1[0], r3 +; CHECK-NEXT: vmov.8 d1[1], r0 +; CHECK-NEXT: vmov.8 d1[2], r1 +; CHECK-NEXT: vmov.8 d1[3], r4 +; CHECK-NEXT: vmov.8 d1[4], r5 +; CHECK-NEXT: vmov.8 d1[5], r2 +; CHECK-NEXT: vmov.8 d1[6], lr +; CHECK-NEXT: vmov.8 d1[7], r12 +; CHECK-NEXT: pop {r4, r5, r11, pc} entry: %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1619,9 +1682,41 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vorr d16, d1, d1 +; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vmov.u8 r1, d0[1] +; CHECK-NEXT: vmov.u8 r3, d0[2] +; CHECK-NEXT: vmov.u8 r6, d0[3] +; CHECK-NEXT: vmov.u8 r5, d0[4] +; CHECK-NEXT: vmov.u8 r0, d0[5] +; CHECK-NEXT: vmov.u8 r2, d0[6] +; CHECK-NEXT: vmov.u8 lr, d0[7] +; CHECK-NEXT: vmov.u8 r12, d16[5] +; CHECK-NEXT: vmov.u8 r4, d16[6] +; CHECK-NEXT: vmov.8 
d0[1], r1 +; CHECK-NEXT: vmov.u8 r1, d16[1] +; CHECK-NEXT: vmov.8 d0[2], r3 +; CHECK-NEXT: vmov.u8 r3, d16[3] +; CHECK-NEXT: vmov.8 d0[3], r6 +; CHECK-NEXT: vmov.u8 r6, d16[0] +; CHECK-NEXT: vmov.8 d0[4], r5 +; CHECK-NEXT: vmov.u8 r5, d16[2] +; CHECK-NEXT: vmov.8 d0[5], r0 +; CHECK-NEXT: vmov.u8 r0, d16[4] +; CHECK-NEXT: vmov.8 d0[6], r2 +; CHECK-NEXT: vmov.u8 r2, d16[7] +; CHECK-NEXT: vmov.8 d0[7], lr +; CHECK-NEXT: vmov.8 d1[0], r6 +; CHECK-NEXT: vmov.8 d1[1], r1 +; CHECK-NEXT: vmov.8 d1[2], r5 +; CHECK-NEXT: vmov.8 d1[3], r3 +; CHECK-NEXT: vmov.8 d1[4], r0 +; CHECK-NEXT: vmov.8 d1[5], r12 +; CHECK-NEXT: vmov.8 d1[6], r4 +; CHECK-NEXT: vmov.8 d1[7], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %vecext = extractelement <8 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1672,6 +1767,14 @@ ; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vmov.u16 r0, d0[0] +; CHECK-NEXT: vmov.u16 r1, d0[1] +; CHECK-NEXT: vmov.u16 r2, d0[2] +; CHECK-NEXT: vmov.u16 r3, d0[3] +; CHECK-NEXT: vmov.16 d0[0], r0 +; CHECK-NEXT: vmov.16 d0[1], r1 +; CHECK-NEXT: vmov.16 d0[2], r2 +; CHECK-NEXT: vmov.16 d0[3], r3 ; CHECK-NEXT: vmov.f64 d1, d2 ; CHECK-NEXT: bx lr entry: @@ -1690,7 +1793,14 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.u16 r0, d2[0] +; CHECK-NEXT: vmov.u16 r1, d2[1] +; CHECK-NEXT: vmov.u16 r2, d2[2] +; CHECK-NEXT: vmov.u16 r3, d2[3] +; CHECK-NEXT: vmov.16 d1[0], r0 +; CHECK-NEXT: vmov.16 d1[1], r1 +; CHECK-NEXT: vmov.16 d1[2], r2 +; CHECK-NEXT: vmov.16 d1[3], r3 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <8 x i16> %x, i32 0 @@ -1715,9 +1825,25 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov.u16 r2, d0[1] +; CHECK-NEXT: vorr d16, d0, d0 +; CHECK-NEXT: vmov.u16 r3, d0[2] +; CHECK-NEXT: vmov.u16 r0, d0[3] +; CHECK-NEXT: vmov.u16 r1, d1[0] +; CHECK-NEXT: vmov.u16 r4, d1[1] +; CHECK-NEXT: vmov.u16 lr, d1[2] +; CHECK-NEXT: vmov.u16 r12, d1[3] +; CHECK-NEXT: vmov.16 d16[1], r2 +; CHECK-NEXT: vmov.16 d16[2], r3 +; CHECK-NEXT: vmov.16 d16[3], r0 +; CHECK-NEXT: vmov.16 d17[0], r1 +; CHECK-NEXT: vmov.16 d17[1], r4 +; CHECK-NEXT: vmov.16 d17[2], lr +; CHECK-NEXT: vmov.16 d17[3], r12 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: pop {r4, pc} entry: %vecext = extractelement <4 x i16> %x, i32 0 %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 @@ -1752,7 +1878,10 @@ ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.32 r0, d0[1] +; CHECK-NEXT: vmov.32 d0[1], r0 +; CHECK-NEXT: vext.32 q8, q0, q0, #2 +; CHECK-NEXT: vext.32 q0, q8, q1, #2 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <2 x i32> %x, i32 0 @@ -1766,7 +1895,10 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 { ; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.32 r0, d2[0] +; CHECK-NEXT: vmov.32 r1, d2[1] 
+; CHECK-NEXT: vmov.32 d1[0], r0 +; CHECK-NEXT: vmov.32 d1[1], r1 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <4 x i32> %x, i32 0 @@ -1817,7 +1949,10 @@ define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.32 r0, d2[0] +; CHECK-NEXT: vmov.32 r1, d2[1] +; CHECK-NEXT: vmov.32 d1[0], r0 +; CHECK-NEXT: vmov.32 d1[1], r1 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <2 x i64> %x, i32 0 @@ -1830,8 +1965,12 @@ define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 +; CHECK-NEXT: vmov.32 r0, d1[0] +; CHECK-NEXT: vorr d16, d0, d0 +; CHECK-NEXT: vmov.32 r1, d1[1] +; CHECK-NEXT: vmov.32 d17[0], r0 +; CHECK-NEXT: vmov.32 d17[1], r1 +; CHECK-NEXT: vorr q0, q8, q8 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <1 x i64> %x, i32 0 diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -427,9 +427,10 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind { ; CHECK-LABEL: check_f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d16, r2, r3 -; CHECK-NEXT: vdup.32 d16, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vdup.32 d2, d1[1] +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: mov pc, lr %x = extractelement <4 x float> %v, i32 3 %1 = insertelement <2 x float> undef, float %x, i32 0 @@ -440,8 +441,10 @@ define <2 x i32> @check_i32(<4 x i32> %v) nounwind { ; CHECK-LABEL: check_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d16, r2, r3 -; CHECK-NEXT: vdup.32 d16, d16[1] +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov.32 r0, d17[1] +; CHECK-NEXT: vdup.32 d16, d17[1] +; CHECK-NEXT: vmov.32 d16[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <4 x i32> %v, i32 3 @@ -454,7 +457,9 @@ ; CHECK-LABEL: check_i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmov.u16 r0, d16[3] ; CHECK-NEXT: vdup.16 d16, d16[3] +; CHECK-NEXT: vmov.16 d16[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <8 x i16> %v, i32 3 @@ -467,7 +472,9 @@ ; CHECK-LABEL: check_i8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmov.u8 r0, d16[3] ; CHECK-NEXT: vdup.8 d16, d16[3] +; CHECK-NEXT: vmov.8 d16[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <16 x i8> %v, i32 3 diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll @@ -273,22 +273,22 @@ ; ASM32-LABEL: double_stack_va_arg: ; ASM32: # %bb.0: # %entry ; ASM32-NEXT: fadd 0, 1, 2 -; ASM32-NEXT: addi 4, 1, 128 -; ASM32-NEXT: lwz 3, 132(1) +; ASM32-NEXT: addi 3, 1, 128 +; ASM32-NEXT: lwz 4, 132(1) ; ASM32-NEXT: fadd 0, 0, 3 -; ASM32-NEXT: stw 4, -4(1) +; ASM32-NEXT: stw 3, -4(1) ; ASM32-NEXT: fadd 0, 0, 4 -; ASM32-NEXT: lwz 4, 128(1) +; ASM32-NEXT: lwz 3, 128(1) ; ASM32-NEXT: fadd 0, 0, 5 -; ASM32-NEXT: stw 3, -12(1) +; ASM32-NEXT: stw 3, -16(1) ; ASM32-NEXT: fadd 0, 0, 6 -; ASM32-NEXT: stw 4, -16(1) +; ASM32-NEXT: stw 4, -12(1) ; ASM32-NEXT: fadd 0, 0, 7 ; 
ASM32-NEXT: lfd 1, -16(1) ; ASM32-NEXT: fadd 0, 0, 8 -; ASM32-NEXT: stw 3, -20(1) +; ASM32-NEXT: stw 3, -24(1) ; ASM32-NEXT: fadd 0, 0, 9 -; ASM32-NEXT: stw 4, -24(1) +; ASM32-NEXT: stw 4, -20(1) ; ASM32-NEXT: fadd 0, 0, 10 ; ASM32-NEXT: fadd 0, 0, 11 ; ASM32-NEXT: fadd 0, 0, 12 @@ -386,3 +386,5 @@ ; 32BIT-DAG: STW renamable $r4, 0, %stack.2 :: (store (s32) into %stack.2, align 8) ; 32BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm ; 32BIT-DAG: BLR implicit $lr, implicit $rm, implicit $f1 +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; 32BIT: {{.*}} diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -13,10 +13,10 @@ ; CHECK-NEXT: xvredp 2, 0 ; CHECK-NEXT: xxswapd 1, 1 ; CHECK-NEXT: xxlor 3, 1, 1 -; CHECK-NEXT: xvnmsubadp 3, 0, 2 -; CHECK-NEXT: xvmaddadp 2, 2, 3 -; CHECK-NEXT: xvnmsubadp 1, 0, 2 -; CHECK-NEXT: xvnmaddadp 2, 2, 1 +; CHECK-NEXT: xvmaddadp 3, 0, 2 +; CHECK-NEXT: xvnmsubadp 2, 2, 3 +; CHECK-NEXT: xvmaddadp 1, 0, 2 +; CHECK-NEXT: xvmsubadp 2, 2, 1 ; CHECK-NEXT: xvmuldp 34, 34, 2 ; CHECK-NEXT: xvmuldp 35, 35, 2 ; CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll --- a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll +++ b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll @@ -14,14 +14,12 @@ ; CHECK-P9-LABEL: test: ; CHECK-P9: # %bb.0: # %test_entry ; CHECK-P9-NEXT: andi. r3, r6, 15 +; CHECK-P9-NEXT: li r3, 2 +; CHECK-P9-NEXT: li r10, 1 ; CHECK-P9-NEXT: lwz r4, 0(r4) ; CHECK-P9-NEXT: lwz r5, 0(r5) -; CHECK-P9-NEXT: li r11, 1 -; CHECK-P9-NEXT: addic r3, r3, -1 -; CHECK-P9-NEXT: subfe r10, r3, r3 -; CHECK-P9-NEXT: li r3, 2 -; CHECK-P9-NEXT: not r10, r10 -; CHECK-P9-NEXT: iseleq r3, r11, r3 +; CHECK-P9-NEXT: iseleq r3, r10, r3 +; CHECK-P9-NEXT: subfic r10, r3, 1 ; CHECK-P9-NEXT: add r4, r10, r4 ; CHECK-P9-NEXT: srawi r4, r4, 4 ; CHECK-P9-NEXT: addze r4, r4 @@ -68,14 +66,13 @@ ; ; CHECK-P10-LABEL: test: ; CHECK-P10: # %bb.0: # %test_entry -; CHECK-P10-NEXT: lwz r4, 0(r4) ; CHECK-P10-NEXT: andi. r3, r6, 15 ; CHECK-P10-NEXT: li r3, 2 ; CHECK-P10-NEXT: li r10, 1 +; CHECK-P10-NEXT: lwz r4, 0(r4) ; CHECK-P10-NEXT: lwz r5, 0(r5) ; CHECK-P10-NEXT: iseleq r3, r10, r3 -; CHECK-P10-NEXT: setnbc r10, eq -; CHECK-P10-NEXT: not r10, r10 +; CHECK-P10-NEXT: subfic r10, r3, 1 ; CHECK-P10-NEXT: add r4, r10, r4 ; CHECK-P10-NEXT: srawi r4, r4, 4 ; CHECK-P10-NEXT: addze r4, r4 diff --git a/llvm/test/CodeGen/PowerPC/select_const.ll b/llvm/test/CodeGen/PowerPC/select_const.ll --- a/llvm/test/CodeGen/PowerPC/select_const.ll +++ b/llvm/test/CodeGen/PowerPC/select_const.ll @@ -494,7 +494,7 @@ define i8 @sel_constants_urem_constant(i1 %cond) { ; ALL-LABEL: sel_constants_urem_constant: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: subfic 3, 3, 3 ; ALL-NEXT: blr %sel = select i1 %cond, i8 -4, i8 23 @@ -529,7 +529,7 @@ define i8 @sel_constants_and_constant(i1 %cond) { ; ALL-LABEL: sel_constants_and_constant: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: subfic 3, 3, 5 ; ALL-NEXT: blr %sel = select i1 %cond, i8 -4, i8 23 @@ -610,24 +610,13 @@ } define i8 @shl_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: shl_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 
3, 3, 1 -; ISEL-NEXT: li 4, 4 -; ISEL-NEXT: li 3, 8 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: shl_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 3, 3, 1 -; NO_ISEL-NEXT: li 4, 4 -; NO_ISEL-NEXT: li 3, 8 -; NO_ISEL-NEXT: bc 12, 1, .LBB37_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB37_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: shl_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, 1 +; ALL-NEXT: subfic 3, 3, 3 +; ALL-NEXT: slw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = shl i8 1, %sel ret i8 %bo @@ -658,24 +647,13 @@ } define i8 @lshr_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: lshr_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, 16 -; ISEL-NEXT: li 3, 8 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: lshr_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 3, 3, 1 -; NO_ISEL-NEXT: li 4, 16 -; NO_ISEL-NEXT: li 3, 8 -; NO_ISEL-NEXT: bc 12, 1, .LBB39_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB39_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: lshr_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, 64 +; ALL-NEXT: subfic 3, 3, 3 +; ALL-NEXT: srw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = lshr i8 64, %sel ret i8 %bo @@ -685,7 +663,7 @@ define i8 @sel_constants_ashr_constant(i1 %cond) { ; ALL-LABEL: sel_constants_ashr_constant: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: neg 3, 3 ; ALL-NEXT: blr %sel = select i1 %cond, i8 -4, i8 23 @@ -694,24 +672,13 @@ } define i8 @ashr_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: ashr_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, -32 -; ISEL-NEXT: li 3, -16 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: ashr_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 
3, 3, 1 -; NO_ISEL-NEXT: li 4, -32 -; NO_ISEL-NEXT: li 3, -16 -; NO_ISEL-NEXT: bc 12, 1, .LBB41_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB41_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: ashr_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, -128 +; ALL-NEXT: subfic 3, 3, 3 +; ALL-NEXT: sraw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = ashr i8 128, %sel ret i8 %bo diff --git a/llvm/test/CodeGen/PowerPC/testComparesigeuc.ll b/llvm/test/CodeGen/PowerPC/testComparesigeuc.ll --- a/llvm/test/CodeGen/PowerPC/testComparesigeuc.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesigeuc.ll @@ -67,8 +67,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r3, r4 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stb r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesigeui.ll b/llvm/test/CodeGen/PowerPC/testComparesigeui.ll --- a/llvm/test/CodeGen/PowerPC/testComparesigeui.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesigeui.ll @@ -66,8 +66,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r3, r4 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stw r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesigeus.ll b/llvm/test/CodeGen/PowerPC/testComparesigeus.ll --- a/llvm/test/CodeGen/PowerPC/testComparesigeus.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesigeus.ll @@ -66,8 +66,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r3, r4 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: sth r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesileuc.ll b/llvm/test/CodeGen/PowerPC/testComparesileuc.ll --- a/llvm/test/CodeGen/PowerPC/testComparesileuc.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesileuc.ll @@ -69,8 +69,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r4, r3 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stb r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesileui.ll b/llvm/test/CodeGen/PowerPC/testComparesileui.ll --- a/llvm/test/CodeGen/PowerPC/testComparesileui.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesileui.ll @@ -69,8 +69,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r4, r3 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stw r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesileus.ll b/llvm/test/CodeGen/PowerPC/testComparesileus.ll --- a/llvm/test/CodeGen/PowerPC/testComparesileus.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesileus.ll @@ -69,8 +69,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r4, r3 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: sth r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git 
a/llvm/test/CodeGen/PowerPC/testComparesllgeuc.ll b/llvm/test/CodeGen/PowerPC/testComparesllgeuc.ll --- a/llvm/test/CodeGen/PowerPC/testComparesllgeuc.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesllgeuc.ll @@ -66,8 +66,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r3, r4 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stb r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesllgeui.ll b/llvm/test/CodeGen/PowerPC/testComparesllgeui.ll --- a/llvm/test/CodeGen/PowerPC/testComparesllgeui.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesllgeui.ll @@ -66,8 +66,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r3, r4 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stw r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesllgeus.ll b/llvm/test/CodeGen/PowerPC/testComparesllgeus.ll --- a/llvm/test/CodeGen/PowerPC/testComparesllgeus.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesllgeus.ll @@ -66,8 +66,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r3, r4 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: sth r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesllleuc.ll b/llvm/test/CodeGen/PowerPC/testComparesllleuc.ll --- a/llvm/test/CodeGen/PowerPC/testComparesllleuc.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesllleuc.ll @@ -69,8 +69,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r4, r3 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stb r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesllleui.ll b/llvm/test/CodeGen/PowerPC/testComparesllleui.ll --- a/llvm/test/CodeGen/PowerPC/testComparesllleui.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesllleui.ll @@ -69,8 +69,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r4, r3 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: stw r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/testComparesllleus.ll b/llvm/test/CodeGen/PowerPC/testComparesllleus.ll --- a/llvm/test/CodeGen/PowerPC/testComparesllleus.ll +++ b/llvm/test/CodeGen/PowerPC/testComparesllleus.ll @@ -69,8 +69,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sub r3, r4, r3 ; CHECK-NEXT: addis r5, r2, glob@toc@ha -; CHECK-NEXT: not r3, r3 ; CHECK-NEXT: rldicl r3, r3, 1, 63 +; CHECK-NEXT: not r3, r3 +; CHECK-NEXT: clrlwi r3, r3, 31 ; CHECK-NEXT: sth r3, glob@toc@l(r5) ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll b/llvm/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll --- a/llvm/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll +++ b/llvm/test/CodeGen/PowerPC/vec_buildvector_loadstore.ll @@ -8,17 +8,85 @@ define void @foo() nounwind ssp { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stwu 1, -112(1) ; CHECK-NEXT: li 3, a@l ; CHECK-NEXT: lis 4, a@ha +; CHECK-NEXT: li 5, .LCPI0_0@l ; CHECK-NEXT: 
lvx 2, 4, 3 -; CHECK-NEXT: li 3, .LCPI0_0@l -; CHECK-NEXT: lis 4, .LCPI0_0@ha -; CHECK-NEXT: lvx 3, 4, 3 -; CHECK-NEXT: vxor 4, 4, 4 +; CHECK-NEXT: lis 3, .LCPI0_0@ha +; CHECK-NEXT: li 4, .LCPI0_1@l +; CHECK-NEXT: lvx 3, 3, 5 +; CHECK-NEXT: lis 3, .LCPI0_1@ha +; CHECK-NEXT: lvx 4, 3, 4 +; CHECK-NEXT: li 3, .LCPI0_2@l +; CHECK-NEXT: lis 4, .LCPI0_2@ha +; CHECK-NEXT: lvx 5, 4, 3 +; CHECK-NEXT: li 3, .LCPI0_3@l +; CHECK-NEXT: lis 4, .LCPI0_3@ha +; CHECK-NEXT: lvx 0, 4, 3 +; CHECK-NEXT: li 3, .LCPI0_4@l +; CHECK-NEXT: lis 4, .LCPI0_4@ha +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: lvx 1, 4, 3 +; CHECK-NEXT: li 3, .LCPI0_5@l +; CHECK-NEXT: lis 4, .LCPI0_5@ha +; CHECK-NEXT: stb 5, 16(1) +; CHECK-NEXT: li 5, .LCPI0_6@l +; CHECK-NEXT: lvx 6, 4, 3 +; CHECK-NEXT: lis 3, .LCPI0_6@ha +; CHECK-NEXT: lvx 7, 3, 5 +; CHECK-NEXT: addi 3, 1, 16 +; CHECK-NEXT: lvx 8, 0, 3 +; CHECK-NEXT: addi 3, 1, 96 +; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: li 3, .LCPI0_7@l +; CHECK-NEXT: lbz 4, 100(1) +; CHECK-NEXT: lis 5, .LCPI0_7@ha +; CHECK-NEXT: lvx 9, 5, 3 +; CHECK-NEXT: vxor 10, 10, 10 +; CHECK-NEXT: addi 3, 1, 80 +; CHECK-NEXT: stb 4, 80(1) +; CHECK-NEXT: vperm 3, 2, 10, 3 +; CHECK-NEXT: lis 5, .LCPI0_8@ha +; CHECK-NEXT: lbz 4, 101(1) +; CHECK-NEXT: vperm 3, 3, 8, 4 +; CHECK-NEXT: lvx 10, 0, 3 +; CHECK-NEXT: li 3, .LCPI0_8@l +; CHECK-NEXT: lvx 4, 5, 3 +; CHECK-NEXT: addi 3, 1, 64 +; CHECK-NEXT: stb 4, 64(1) +; CHECK-NEXT: vperm 3, 3, 8, 5 +; CHECK-NEXT: lis 5, .LCPI0_9@ha +; CHECK-NEXT: lbz 4, 104(1) +; CHECK-NEXT: vperm 3, 3, 10, 0 +; CHECK-NEXT: lvx 5, 0, 3 +; CHECK-NEXT: li 3, .LCPI0_9@l +; CHECK-NEXT: lvx 0, 5, 3 +; CHECK-NEXT: addi 3, 1, 48 +; CHECK-NEXT: stb 4, 48(1) +; CHECK-NEXT: vperm 3, 3, 5, 1 +; CHECK-NEXT: lis 5, .LCPI0_10@ha +; CHECK-NEXT: lbz 4, 105(1) +; CHECK-NEXT: vperm 3, 3, 8, 6 +; CHECK-NEXT: lvx 5, 0, 3 +; CHECK-NEXT: li 3, .LCPI0_10@l +; CHECK-NEXT: lvx 1, 5, 3 +; CHECK-NEXT: vperm 3, 3, 8, 7 +; CHECK-NEXT: addi 3, 1, 32 +; CHECK-NEXT: stb 4, 32(1) +; CHECK-NEXT: vperm 3, 3, 5, 9 +; CHECK-NEXT: lis 4, .LCPI0_11@ha +; CHECK-NEXT: lvx 5, 0, 3 +; CHECK-NEXT: li 3, .LCPI0_11@l +; CHECK-NEXT: lvx 6, 4, 3 ; CHECK-NEXT: li 3, c@l +; CHECK-NEXT: vperm 3, 3, 5, 4 ; CHECK-NEXT: lis 4, c@ha -; CHECK-NEXT: vperm 2, 4, 2, 3 +; CHECK-NEXT: vperm 3, 3, 8, 0 +; CHECK-NEXT: vperm 3, 3, 8, 1 +; CHECK-NEXT: vperm 2, 3, 2, 6 ; CHECK-NEXT: stvx 2, 4, 3 +; CHECK-NEXT: addi 1, 1, 112 ; CHECK-NEXT: blr entry: %tmp0 = load <16 x i8>, <16 x i8>* @a, align 16 diff --git a/llvm/test/CodeGen/PowerPC/vec_shuffle.ll b/llvm/test/CodeGen/PowerPC/vec_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/vec_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/vec_shuffle.ll @@ -227,10 +227,147 @@ define void @tb_h(<16 x i8>* %A, <16 x i8>* %B) { ; CHECK-LABEL: tb_h: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stwu 1, -304(1) +; CHECK-NEXT: .cfi_def_cfa_offset 304 +; CHECK-NEXT: .cfi_offset r26, -24 +; CHECK-NEXT: .cfi_offset r27, -20 +; CHECK-NEXT: .cfi_offset r28, -16 +; CHECK-NEXT: .cfi_offset r29, -12 +; CHECK-NEXT: .cfi_offset r30, -8 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_0@l +; CHECK-NEXT: lis 5, .LCPI6_0@ha +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: li 4, .LCPI6_1@l +; CHECK-NEXT: lis 5, .LCPI6_1@ha +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: li 4, .LCPI6_2@l +; CHECK-NEXT: lis 5, .LCPI6_2@ha +; CHECK-NEXT: lvx 0, 5, 4 +; CHECK-NEXT: li 4, .LCPI6_3@l +; CHECK-NEXT: lis 5, .LCPI6_3@ha ; CHECK-NEXT: lvx 2, 0, 3 +; CHECK-NEXT: lvx 1, 5, 4 +; CHECK-NEXT: li 4, .LCPI6_4@l +; CHECK-NEXT: lis 5, .LCPI6_4@ha +; CHECK-NEXT: lvx 6, 5, 4 +; 
CHECK-NEXT: li 4, .LCPI6_5@l +; CHECK-NEXT: lis 5, .LCPI6_5@ha +; CHECK-NEXT: lvx 7, 5, 4 +; CHECK-NEXT: addi 4, 1, 240 +; CHECK-NEXT: addi 5, 1, 256 +; CHECK-NEXT: stw 26, 280(1) # 4-byte Folded Spill +; CHECK-NEXT: lis 26, .LCPI6_6@ha +; CHECK-NEXT: stw 27, 284(1) # 4-byte Folded Spill +; CHECK-NEXT: stw 28, 288(1) # 4-byte Folded Spill +; CHECK-NEXT: stw 29, 292(1) # 4-byte Folded Spill +; CHECK-NEXT: stw 30, 296(1) # 4-byte Folded Spill +; CHECK-NEXT: stvx 2, 0, 4 +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lbz 4, 241(1) +; CHECK-NEXT: stvx 3, 0, 5 +; CHECK-NEXT: lbz 5, 257(1) +; CHECK-NEXT: stb 4, 224(1) +; CHECK-NEXT: addi 4, 1, 224 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_6@l +; CHECK-NEXT: lbz 6, 242(1) +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 4, 26, 4 +; CHECK-NEXT: addi 4, 1, 208 +; CHECK-NEXT: stb 5, 208(1) +; CHECK-NEXT: lis 5, .LCPI6_7@ha +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: addi 4, 1, 192 +; CHECK-NEXT: lbz 7, 258(1) +; CHECK-NEXT: vperm 2, 2, 3, 0 +; CHECK-NEXT: stb 6, 192(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_7@l +; CHECK-NEXT: lbz 8, 243(1) +; CHECK-NEXT: vperm 2, 2, 3, 1 +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 176 +; CHECK-NEXT: lis 5, .LCPI6_8@ha +; CHECK-NEXT: stb 7, 176(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_8@l +; CHECK-NEXT: lbz 9, 259(1) +; CHECK-NEXT: vperm 2, 2, 3, 6 +; CHECK-NEXT: lvx 0, 5, 4 +; CHECK-NEXT: addi 4, 1, 160 +; CHECK-NEXT: lis 5, .LCPI6_9@ha +; CHECK-NEXT: stb 8, 160(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: addi 4, 1, 144 +; CHECK-NEXT: lbz 10, 244(1) +; CHECK-NEXT: vperm 2, 2, 3, 7 +; CHECK-NEXT: stb 9, 144(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_9@l +; CHECK-NEXT: lbz 11, 260(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 1, 5, 4 +; CHECK-NEXT: addi 4, 1, 128 +; CHECK-NEXT: lis 5, .LCPI6_10@ha +; CHECK-NEXT: stb 10, 128(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_10@l +; CHECK-NEXT: lbz 12, 245(1) +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: addi 4, 1, 112 +; CHECK-NEXT: lis 5, .LCPI6_11@ha +; CHECK-NEXT: stb 11, 112(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: addi 4, 1, 96 +; CHECK-NEXT: lbz 0, 261(1) +; CHECK-NEXT: vperm 2, 2, 3, 0 +; CHECK-NEXT: stb 12, 96(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_11@l +; CHECK-NEXT: lbz 30, 246(1) +; CHECK-NEXT: vperm 2, 2, 3, 1 +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 80 +; CHECK-NEXT: lis 5, .LCPI6_12@ha +; CHECK-NEXT: stb 0, 80(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_12@l +; CHECK-NEXT: lbz 29, 262(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 0, 5, 4 +; CHECK-NEXT: addi 4, 1, 64 +; CHECK-NEXT: lis 5, .LCPI6_13@ha +; CHECK-NEXT: stb 30, 64(1) ; CHECK-NEXT: lvx 3, 0, 4 -; CHECK-NEXT: vmrghb 2, 2, 3 +; CHECK-NEXT: addi 4, 1, 48 +; CHECK-NEXT: lbz 28, 247(1) +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: stb 29, 48(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_13@l +; CHECK-NEXT: lbz 27, 263(1) +; CHECK-NEXT: vperm 2, 2, 3, 0 +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: addi 4, 1, 32 +; CHECK-NEXT: lis 5, .LCPI6_14@ha +; CHECK-NEXT: stb 28, 32(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI6_14@l +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 16 +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: stb 27, 16(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lwz 30, 296(1) # 4-byte Folded Reload ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: lwz 29, 292(1) # 4-byte 
Folded Reload +; CHECK-NEXT: lwz 28, 288(1) # 4-byte Folded Reload +; CHECK-NEXT: lwz 27, 284(1) # 4-byte Folded Reload +; CHECK-NEXT: lwz 26, 280(1) # 4-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 304 ; CHECK-NEXT: blr entry: %tmp = load <16 x i8>, <16 x i8>* %A ; <<16 x i8>> [#uses=8] @@ -274,10 +411,68 @@ define void @th_h(<8 x i16>* %A, <8 x i16>* %B) { ; CHECK-LABEL: th_h: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stwu 1, -144(1) +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI7_0@l +; CHECK-NEXT: lis 5, .LCPI7_0@ha +; CHECK-NEXT: lis 10, .LCPI7_3@ha ; CHECK-NEXT: lvx 2, 0, 3 +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: li 4, .LCPI7_1@l +; CHECK-NEXT: lis 5, .LCPI7_1@ha +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: li 4, .LCPI7_2@l +; CHECK-NEXT: lis 5, .LCPI7_2@ha +; CHECK-NEXT: lvx 0, 5, 4 +; CHECK-NEXT: addi 4, 1, 112 +; CHECK-NEXT: addi 5, 1, 128 +; CHECK-NEXT: stvx 2, 0, 4 +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lhz 4, 114(1) +; CHECK-NEXT: stvx 3, 0, 5 +; CHECK-NEXT: lhz 5, 130(1) +; CHECK-NEXT: sth 4, 96(1) +; CHECK-NEXT: addi 4, 1, 96 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI7_3@l +; CHECK-NEXT: lhz 6, 116(1) +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 4, 10, 4 +; CHECK-NEXT: addi 4, 1, 80 +; CHECK-NEXT: sth 5, 80(1) +; CHECK-NEXT: lis 5, .LCPI7_4@ha +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI7_4@l +; CHECK-NEXT: lhz 7, 132(1) +; CHECK-NEXT: vperm 2, 2, 3, 0 +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 64 +; CHECK-NEXT: lis 5, .LCPI7_5@ha +; CHECK-NEXT: sth 6, 64(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: addi 4, 1, 48 +; CHECK-NEXT: lhz 8, 118(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: sth 7, 48(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI7_5@l +; CHECK-NEXT: lhz 9, 134(1) +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: addi 4, 1, 32 +; CHECK-NEXT: lis 5, .LCPI7_6@ha +; CHECK-NEXT: sth 8, 32(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI7_6@l +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 16 +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: sth 9, 16(1) ; CHECK-NEXT: lvx 3, 0, 4 -; CHECK-NEXT: vmrghh 2, 2, 3 +; CHECK-NEXT: vperm 2, 2, 3, 5 ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: addi 1, 1, 144 ; CHECK-NEXT: blr entry: %tmp = load <8 x i16>, <8 x i16>* %A ; <<8 x i16>> [#uses=4] @@ -305,10 +500,36 @@ define void @tw_h(<4 x i32>* %A, <4 x i32>* %B) { ; CHECK-LABEL: tw_h: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stwu 1, -80(1) +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI8_0@l +; CHECK-NEXT: lis 5, .LCPI8_0@ha +; CHECK-NEXT: lis 6, .LCPI8_2@ha +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: li 4, .LCPI8_1@l +; CHECK-NEXT: lis 5, .LCPI8_1@ha +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 64 +; CHECK-NEXT: addi 5, 1, 48 ; CHECK-NEXT: lvx 2, 0, 3 +; CHECK-NEXT: stvx 3, 0, 4 +; CHECK-NEXT: lwz 4, 68(1) +; CHECK-NEXT: stvx 2, 0, 5 +; CHECK-NEXT: vperm 2, 3, 2, 4 +; CHECK-NEXT: lwz 5, 52(1) +; CHECK-NEXT: stw 4, 32(1) +; CHECK-NEXT: addi 4, 1, 32 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI8_2@l +; CHECK-NEXT: lvx 4, 6, 4 +; CHECK-NEXT: addi 4, 1, 16 +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: stw 5, 16(1) ; CHECK-NEXT: lvx 3, 0, 4 -; CHECK-NEXT: vmrghw 2, 3, 2 +; CHECK-NEXT: vperm 2, 2, 3, 4 ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: addi 1, 1, 80 ; CHECK-NEXT: blr entry: %tmp = load <4 x i32>, <4 x i32>* %A ; <<4 x i32>> [#uses=2] @@ -328,10 +549,36 @@ define void @tw_h_flop(<4 x 
i32>* %A, <4 x i32>* %B) { ; CHECK-LABEL: tw_h_flop: ; CHECK: # %bb.0: +; CHECK-NEXT: stwu 1, -80(1) +; CHECK-NEXT: .cfi_def_cfa_offset 80 ; CHECK-NEXT: lvx 2, 0, 3 +; CHECK-NEXT: lis 5, .LCPI9_0@ha +; CHECK-NEXT: lis 6, .LCPI9_2@ha ; CHECK-NEXT: lvx 3, 0, 4 -; CHECK-NEXT: vmrghw 2, 2, 3 +; CHECK-NEXT: li 4, .LCPI9_0@l +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: li 4, .LCPI9_1@l +; CHECK-NEXT: lis 5, .LCPI9_1@ha +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 48 +; CHECK-NEXT: addi 5, 1, 64 +; CHECK-NEXT: stvx 2, 0, 4 +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lwz 4, 52(1) +; CHECK-NEXT: stvx 3, 0, 5 +; CHECK-NEXT: lwz 5, 68(1) +; CHECK-NEXT: stw 4, 32(1) +; CHECK-NEXT: addi 4, 1, 32 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI9_2@l +; CHECK-NEXT: lvx 4, 6, 4 +; CHECK-NEXT: addi 4, 1, 16 +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: stw 5, 16(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: vperm 2, 2, 3, 4 ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: addi 1, 1, 80 ; CHECK-NEXT: blr %tmp = load <4 x i32>, <4 x i32>* %A ; <<4 x i32>> [#uses=2] %tmp2 = load <4 x i32>, <4 x i32>* %B ; <<4 x i32>> [#uses=2] @@ -445,9 +692,101 @@ define void @VMRG_UNARY_tb_h(<16 x i8>* %A, <16 x i8>* %B) { ; CHECK-LABEL: VMRG_UNARY_tb_h: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stwu 1, -144(1) +; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: lvx 2, 0, 3 -; CHECK-NEXT: vmrghb 2, 2, 2 +; CHECK-NEXT: li 4, .LCPI13_0@l +; CHECK-NEXT: lis 5, .LCPI13_0@ha +; CHECK-NEXT: lis 11, .LCPI13_2@ha +; CHECK-NEXT: lvx 3, 5, 4 +; CHECK-NEXT: li 4, .LCPI13_1@l +; CHECK-NEXT: lis 5, .LCPI13_1@ha +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: addi 4, 1, 128 +; CHECK-NEXT: stvx 2, 0, 4 +; CHECK-NEXT: vperm 2, 2, 2, 3 +; CHECK-NEXT: lbz 5, 129(1) +; CHECK-NEXT: stb 5, 112(1) +; CHECK-NEXT: addi 5, 1, 112 +; CHECK-NEXT: lvx 3, 0, 5 +; CHECK-NEXT: li 5, .LCPI13_2@l +; CHECK-NEXT: lbz 6, 130(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 4, 11, 5 +; CHECK-NEXT: li 5, .LCPI13_3@l +; CHECK-NEXT: lis 11, .LCPI13_3@ha +; CHECK-NEXT: lvx 5, 11, 5 +; CHECK-NEXT: addi 5, 1, 96 +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: stb 6, 96(1) +; CHECK-NEXT: lis 6, .LCPI13_4@ha +; CHECK-NEXT: lbz 7, 131(1) +; CHECK-NEXT: lvx 3, 0, 5 +; CHECK-NEXT: li 5, .LCPI13_4@l +; CHECK-NEXT: lvx 4, 6, 5 +; CHECK-NEXT: li 5, .LCPI13_5@l +; CHECK-NEXT: lis 6, .LCPI13_5@ha +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 5, 6, 5 +; CHECK-NEXT: addi 5, 1, 80 +; CHECK-NEXT: lis 6, .LCPI13_6@ha +; CHECK-NEXT: stb 7, 80(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lbz 8, 132(1) +; CHECK-NEXT: lvx 3, 0, 5 +; CHECK-NEXT: li 5, .LCPI13_6@l +; CHECK-NEXT: lvx 4, 6, 5 +; CHECK-NEXT: li 5, .LCPI13_7@l +; CHECK-NEXT: lis 6, .LCPI13_7@ha +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 5, 6, 5 +; CHECK-NEXT: addi 5, 1, 64 +; CHECK-NEXT: lis 6, .LCPI13_8@ha +; CHECK-NEXT: stb 8, 64(1) +; CHECK-NEXT: lbz 9, 133(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 3, 0, 5 +; CHECK-NEXT: li 5, .LCPI13_8@l +; CHECK-NEXT: lvx 4, 6, 5 +; CHECK-NEXT: li 5, .LCPI13_9@l +; CHECK-NEXT: lis 6, .LCPI13_9@ha +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 5, 6, 5 +; CHECK-NEXT: addi 5, 1, 48 +; CHECK-NEXT: lis 6, .LCPI13_10@ha +; CHECK-NEXT: stb 9, 48(1) +; CHECK-NEXT: lbz 10, 134(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 3, 0, 5 +; CHECK-NEXT: li 5, .LCPI13_10@l +; CHECK-NEXT: lvx 4, 6, 5 +; CHECK-NEXT: li 5, .LCPI13_11@l +; CHECK-NEXT: lis 6, .LCPI13_11@ha +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 5, 6, 5 +; CHECK-NEXT: addi 5, 1, 32 +; 
CHECK-NEXT: lis 6, .LCPI13_12@ha +; CHECK-NEXT: stb 10, 32(1) +; CHECK-NEXT: lbz 4, 135(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 3, 0, 5 +; CHECK-NEXT: li 5, .LCPI13_12@l +; CHECK-NEXT: lvx 4, 6, 5 +; CHECK-NEXT: li 5, .LCPI13_13@l +; CHECK-NEXT: lis 6, .LCPI13_13@ha +; CHECK-NEXT: stb 4, 16(1) +; CHECK-NEXT: addi 4, 1, 16 +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 5, 6, 5 +; CHECK-NEXT: lis 5, .LCPI13_14@ha +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI13_14@l +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: vperm 2, 2, 3, 4 ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: addi 1, 1, 144 ; CHECK-NEXT: blr entry: %tmp = load <16 x i8>, <16 x i8>* %A ; <<16 x i8>> [#uses=16] @@ -490,9 +829,53 @@ define void @VMRG_UNARY_th_h(<8 x i16>* %A, <8 x i16>* %B) { ; CHECK-LABEL: VMRG_UNARY_th_h: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stwu 1, -80(1) +; CHECK-NEXT: .cfi_def_cfa_offset 80 ; CHECK-NEXT: lvx 2, 0, 3 -; CHECK-NEXT: vmrghh 2, 2, 2 +; CHECK-NEXT: li 4, .LCPI14_0@l +; CHECK-NEXT: lis 5, .LCPI14_0@ha +; CHECK-NEXT: lis 7, .LCPI14_2@ha +; CHECK-NEXT: lvx 3, 5, 4 +; CHECK-NEXT: li 4, .LCPI14_1@l +; CHECK-NEXT: lis 5, .LCPI14_1@ha +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: addi 4, 1, 64 +; CHECK-NEXT: stvx 2, 0, 4 +; CHECK-NEXT: vperm 2, 2, 2, 3 +; CHECK-NEXT: lhz 4, 66(1) +; CHECK-NEXT: sth 4, 48(1) +; CHECK-NEXT: addi 4, 1, 48 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI14_2@l +; CHECK-NEXT: lhz 5, 68(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 4, 7, 4 +; CHECK-NEXT: li 4, .LCPI14_3@l +; CHECK-NEXT: lis 7, .LCPI14_3@ha +; CHECK-NEXT: lvx 5, 7, 4 +; CHECK-NEXT: addi 4, 1, 32 +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: sth 5, 32(1) +; CHECK-NEXT: lis 5, .LCPI14_4@ha +; CHECK-NEXT: lhz 6, 70(1) +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI14_4@l +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: li 4, .LCPI14_5@l +; CHECK-NEXT: lis 5, .LCPI14_5@ha +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: lvx 5, 5, 4 +; CHECK-NEXT: addi 4, 1, 16 +; CHECK-NEXT: lis 5, .LCPI14_6@ha +; CHECK-NEXT: sth 6, 16(1) +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI14_6@l +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: vperm 2, 2, 3, 5 +; CHECK-NEXT: vperm 2, 2, 3, 4 ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: addi 1, 1, 80 ; CHECK-NEXT: blr entry: %tmp = load <8 x i16>, <8 x i16>* %A ; <<8 x i16>> [#uses=8] @@ -519,9 +902,29 @@ define void @VMRG_UNARY_tw_h(<4 x i32>* %A, <4 x i32>* %B) { ; CHECK-LABEL: VMRG_UNARY_tw_h: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stwu 1, -48(1) +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: lvx 2, 0, 3 -; CHECK-NEXT: vmrghw 2, 2, 2 +; CHECK-NEXT: li 4, .LCPI15_0@l +; CHECK-NEXT: lis 5, .LCPI15_0@ha +; CHECK-NEXT: lvx 3, 5, 4 +; CHECK-NEXT: li 4, .LCPI15_1@l +; CHECK-NEXT: lis 5, .LCPI15_1@ha +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: addi 4, 1, 32 +; CHECK-NEXT: lis 5, .LCPI15_2@ha +; CHECK-NEXT: stvx 2, 0, 4 +; CHECK-NEXT: vperm 2, 2, 2, 3 +; CHECK-NEXT: lwz 4, 36(1) +; CHECK-NEXT: stw 4, 16(1) +; CHECK-NEXT: addi 4, 1, 16 +; CHECK-NEXT: lvx 3, 0, 4 +; CHECK-NEXT: li 4, .LCPI15_2@l +; CHECK-NEXT: vperm 2, 2, 3, 4 +; CHECK-NEXT: lvx 4, 5, 4 +; CHECK-NEXT: vperm 2, 2, 3, 4 ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: addi 1, 1, 48 ; CHECK-NEXT: blr entry: %tmp = load <4 x i32>, <4 x i32>* %A ; <<4 x i32>> [#uses=4] diff --git a/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector_le.ll b/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector_le.ll --- 
a/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector_le.ll +++ b/llvm/test/CodeGen/PowerPC/vec_shuffle_p8vector_le.ll @@ -4,8 +4,19 @@ define void @VPKUDUM_unary(<2 x i64>* %A) { ; CHECK-LABEL: VPKUDUM_unary: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lvx 2, 0, 3 -; CHECK-NEXT: vpkudum 2, 2, 2 +; CHECK-NEXT: addis 4, 2, .LCPI0_0@toc@ha +; CHECK-NEXT: lvx 3, 0, 3 +; CHECK-NEXT: addi 4, 4, .LCPI0_0@toc@l +; CHECK-NEXT: lvx 2, 0, 4 +; CHECK-NEXT: addis 4, 2, .LCPI0_1@toc@ha +; CHECK-NEXT: addi 4, 4, .LCPI0_1@toc@l +; CHECK-NEXT: lvx 4, 0, 4 +; CHECK-NEXT: addis 4, 2, .LCPI0_2@toc@ha +; CHECK-NEXT: vperm 2, 3, 3, 2 +; CHECK-NEXT: addi 4, 4, .LCPI0_2@toc@l +; CHECK-NEXT: vperm 2, 3, 2, 4 +; CHECK-NEXT: lvx 4, 0, 4 +; CHECK-NEXT: vperm 2, 3, 2, 4 ; CHECK-NEXT: stvx 2, 0, 3 ; CHECK-NEXT: blr entry: @@ -25,9 +36,20 @@ define void @VPKUDUM(<2 x i64>* %A, <2 x i64>* %B) { ; CHECK-LABEL: VPKUDUM: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lvx 2, 0, 3 +; CHECK-NEXT: addis 5, 2, .LCPI1_0@toc@ha +; CHECK-NEXT: lvx 3, 0, 3 +; CHECK-NEXT: lvx 4, 0, 4 +; CHECK-NEXT: addis 4, 2, .LCPI1_2@toc@ha +; CHECK-NEXT: addi 5, 5, .LCPI1_0@toc@l +; CHECK-NEXT: addi 4, 4, .LCPI1_2@toc@l +; CHECK-NEXT: lvx 2, 0, 5 +; CHECK-NEXT: addis 5, 2, .LCPI1_1@toc@ha +; CHECK-NEXT: addi 5, 5, .LCPI1_1@toc@l +; CHECK-NEXT: vperm 2, 3, 3, 2 +; CHECK-NEXT: lvx 3, 0, 5 +; CHECK-NEXT: vperm 2, 4, 2, 3 ; CHECK-NEXT: lvx 3, 0, 4 -; CHECK-NEXT: vpkudum 2, 3, 2 +; CHECK-NEXT: vperm 2, 4, 2, 3 ; CHECK-NEXT: stvx 2, 0, 3 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll --- a/llvm/test/CodeGen/PowerPC/vsx.ll +++ b/llvm/test/CodeGen/PowerPC/vsx.ll @@ -2419,7 +2419,7 @@ ; ; CHECK-LE-LABEL: test80: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: mtfprwz f0, r3 +; CHECK-LE-NEXT: mtfprd f0, r3 ; CHECK-LE-NEXT: addis r4, r2, .LCPI65_0@toc@ha ; CHECK-LE-NEXT: addi r3, r4, .LCPI65_0@toc@l ; CHECK-LE-NEXT: xxspltw v2, vs0, 1 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1324,9 +1324,8 @@ ; RV32IM-NEXT: li a5, -63 ; RV32IM-NEXT: mulhu a6, a3, a5 ; RV32IM-NEXT: slli a7, a4, 6 -; RV32IM-NEXT: sub a7, a7, a4 -; RV32IM-NEXT: sub a6, a6, a7 -; RV32IM-NEXT: neg a7, a7 +; RV32IM-NEXT: sub a7, a4, a7 +; RV32IM-NEXT: add a6, a7, a6 ; RV32IM-NEXT: sltu a7, a6, a7 ; RV32IM-NEXT: mulhu t0, a4, a5 ; RV32IM-NEXT: add a7, t0, a7 @@ -1339,9 +1338,9 @@ ; RV32IM-NEXT: add t1, a7, t1 ; RV32IM-NEXT: sub t4, t1, a4 ; RV32IM-NEXT: slli t5, a1, 6 -; RV32IM-NEXT: sub t5, t5, a1 -; RV32IM-NEXT: add t5, t5, a3 -; RV32IM-NEXT: sub t6, t4, t5 +; RV32IM-NEXT: sub t5, a1, t5 +; RV32IM-NEXT: sub t5, t5, a3 +; RV32IM-NEXT: add t6, t4, t5 ; RV32IM-NEXT: sltu s0, t6, t4 ; RV32IM-NEXT: neg s1, a4 ; RV32IM-NEXT: sltu t4, t4, s1 @@ -1349,6 +1348,7 @@ ; RV32IM-NEXT: mulhu t1, a4, t2 ; RV32IM-NEXT: add a7, t1, a7 ; RV32IM-NEXT: add a7, a7, t4 +; RV32IM-NEXT: sltu t0, t5, t0 ; RV32IM-NEXT: slli t1, a2, 6 ; RV32IM-NEXT: sub a2, a2, t1 ; RV32IM-NEXT: mulhu a5, a1, a5 @@ -1357,9 +1357,7 @@ ; RV32IM-NEXT: sub a2, t3, a3 ; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: neg a2, t5 -; RV32IM-NEXT: sltu a2, a2, t0 -; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: add a1, a1, t0 ; RV32IM-NEXT: add a1, a7, a1 ; RV32IM-NEXT: add a1, a1, s0 ; RV32IM-NEXT: slli a2, a3, 6 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll @@ -215,18 +215,11 @@ } define void @splat_v32i1(<32 x i1>* %x, i1 %y) { -; LMULMAX2-LABEL: splat_v32i1: -; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: andi a1, a1, 1 -; LMULMAX2-NEXT: li a2, 32 -; LMULMAX2-NEXT: vsetvli zero, a2, e8, m2, ta, mu -; LMULMAX2-NEXT: vmv.v.x v8, a1 -; LMULMAX2-NEXT: vmsne.vi v10, v8, 0 -; LMULMAX2-NEXT: vsm.v v10, (a0) -; LMULMAX2-NEXT: ret -; ; LMULMAX1-RV32-LABEL: splat_v32i1: ; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v8 ; LMULMAX1-RV32-NEXT: andi a1, a1, 1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v8, a1 @@ -293,18 +286,6 @@ } define void @splat_v64i1(<64 x i1>* %x, i1 %y) { -; LMULMAX2-LABEL: splat_v64i1: -; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: andi a1, a1, 1 -; LMULMAX2-NEXT: li a2, 32 -; LMULMAX2-NEXT: vsetvli zero, a2, e8, m2, ta, mu -; LMULMAX2-NEXT: vmv.v.x v8, a1 -; LMULMAX2-NEXT: vmsne.vi v10, v8, 0 -; LMULMAX2-NEXT: addi a1, a0, 4 -; LMULMAX2-NEXT: vsm.v v10, (a1) -; LMULMAX2-NEXT: vsm.v v10, (a0) -; LMULMAX2-NEXT: ret -; ; LMULMAX1-RV32-LABEL: splat_v64i1: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: andi a1, a1, 1 @@ -322,6 +303,9 @@ ; ; LMULMAX1-RV64-LABEL: splat_v64i1: ; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; LMULMAX1-RV64-NEXT: vmv.v.x v8, a1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v8 ; LMULMAX1-RV64-NEXT: andi a1, a1, 1 ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-RV64-NEXT: vmv.v.x v8, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -106,11 +106,21 @@ } define <8 x i1> @icmp_eq_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_eq_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_eq_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmseq.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_eq_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmseq.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"eq", <8 x i1> %m, i32 %evl) @@ -118,11 +128,21 @@ } define <8 x i1> @icmp_eq_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_eq_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_eq_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmseq.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_eq_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli 
zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmseq.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"eq", <8 x i1> %m, i32 %evl) @@ -164,11 +184,21 @@ } define <8 x i1> @icmp_ne_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ne_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ne_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ne_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ne", <8 x i1> %m, i32 %evl) @@ -176,11 +206,21 @@ } define <8 x i1> @icmp_ne_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ne_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ne_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ne_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"ne", <8 x i1> %m, i32 %evl) @@ -222,11 +262,21 @@ } define <8 x i1> @icmp_ugt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ugt_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ugt_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ugt_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ugt", <8 x i1> %m, i32 %evl) @@ -234,11 +284,21 @@ } define <8 x i1> @icmp_ugt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; 
CHECK-LABEL: icmp_ugt_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ugt_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsltu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ugt_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"ugt", <8 x i1> %m, i32 %evl) @@ -280,13 +340,23 @@ } define <8 x i1> @icmp_uge_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_uge_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_uge_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsleu.vv v0, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_uge_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsleu.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"uge", <8 x i1> %m, i32 %evl) @@ -294,11 +364,21 @@ } define <8 x i1> @icmp_uge_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_uge_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_uge_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsleu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_uge_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsleu.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"uge", <8 x i1> %m, i32 %evl) @@ -340,11 +420,21 @@ } define <8 x i1> @icmp_ult_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ult_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ult_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsltu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ult_vx_v8i8: +; 
RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ult", <8 x i1> %m, i32 %evl) @@ -352,11 +442,21 @@ } define <8 x i1> @icmp_ult_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ult_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ult_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ult_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"ult", <8 x i1> %m, i32 %evl) @@ -398,11 +498,21 @@ } define <8 x i1> @icmp_sgt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sgt_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sgt_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sgt_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sgt", <8 x i1> %m, i32 %evl) @@ -410,11 +520,21 @@ } define <8 x i1> @icmp_sgt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sgt_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sgt_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmslt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sgt_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"sgt", <8 x 
i1> %m, i32 %evl) @@ -456,13 +576,23 @@ } define <8 x i1> @icmp_sge_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sge_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sge_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sge_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sge", <8 x i1> %m, i32 %evl) @@ -470,11 +600,21 @@ } define <8 x i1> @icmp_sge_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sge_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sge_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sge_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"sge", <8 x i1> %m, i32 %evl) @@ -516,11 +656,21 @@ } define <8 x i1> @icmp_slt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_slt_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_slt_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmslt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_slt_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"slt", <8 x i1> %m, i32 %evl) @@ -528,11 +678,21 @@ } define <8 x i1> @icmp_slt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_slt_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_slt_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli 
zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_slt_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"slt", <8 x i1> %m, i32 %evl) @@ -574,11 +734,21 @@ } define <8 x i1> @icmp_sle_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sle_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sle_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sle_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sle", <8 x i1> %m, i32 %evl) @@ -586,13 +756,23 @@ } define <8 x i1> @icmp_sle_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sle_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sle_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sle_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"sle", <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -117,11 +117,21 @@ } define <4 x i8> @vadd_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; 
RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -129,11 +139,21 @@ } define <4 x i8> @vadd_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -141,11 +161,21 @@ } define <4 x i8> @vadd_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -281,11 +311,21 @@ } define <8 x i8> @vadd_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vadd.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -293,11 +333,21 @@ } define <8 x i8> @vadd_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -526,11 +576,21 @@ } define <2 x i16> @vadd_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.add.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -538,11 +598,21 @@ } define <2 x i16> @vadd_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -602,11 +672,21 @@ } define <4 x i16> @vadd_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vadd.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x 
i32> zeroinitializer %v = call <4 x i16> @llvm.vp.add.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -614,11 +694,21 @@ } define <4 x i16> @vadd_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -830,11 +920,21 @@ } define <2 x i32> @vadd_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vadd.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -842,11 +942,21 @@ } define <2 x i32> @vadd_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll @@ -143,11 +143,21 @@ } define <4 x i8> @vand_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: 
vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.and.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -155,11 +165,21 @@ } define <4 x i8> @vand_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -219,11 +239,21 @@ } define <8 x i8> @vand_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.and.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -231,11 +261,21 @@ } define <8 x i8> @vand_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -371,11 +411,21 @@ } define <2 x i16> @vand_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> 
%m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.and.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -383,11 +433,21 @@ } define <2 x i16> @vand_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -447,11 +507,21 @@ } define <4 x i16> @vand_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.and.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -459,11 +529,21 @@ } define <4 x i16> @vand_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vand.vv v8, v8, 
v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -675,11 +755,21 @@ } define <2 x i32> @vand_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.and.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -687,11 +777,21 @@ } define <2 x i32> @vand_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll @@ -96,11 +96,21 @@ } define <4 x i8> @vdiv_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.sdiv.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -108,11 +118,21 @@ } define <4 x i8> @vdiv_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, 
ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vdiv.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vdiv.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -158,11 +178,21 @@ } define <8 x i8> @vdiv_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.sdiv.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -170,11 +200,21 @@ } define <8 x i8> @vdiv_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vdiv.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -258,11 +298,21 @@ } define <2 x i16> @vdiv_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> 
@llvm.vp.sdiv.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -270,11 +320,21 @@ } define <2 x i16> @vdiv_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vdiv.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vdiv.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -308,11 +368,21 @@ } define <4 x i16> @vdiv_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.sdiv.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -320,11 +390,21 @@ } define <4 x i16> @vdiv_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vdiv.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -458,11 +538,21 @@ } define <2 x i32> @vdiv_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu 
+; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.sdiv.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -470,11 +560,21 @@ } define <2 x i32> @vdiv_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vdiv.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll @@ -95,11 +95,21 @@ } define <4 x i8> @vdivu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.udiv.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -107,11 +117,21 @@ } define <4 x i8> @vdivu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vdivu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vdivu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -157,11 +177,21 @@ } define <8 x i8> @vdivu_vx_v8i8(<8 x i8> %va, i8 
%b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.udiv.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -169,11 +199,21 @@ } define <8 x i8> @vdivu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vdivu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vdivu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -257,11 +297,21 @@ } define <2 x i16> @vdivu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.udiv.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -269,11 +319,21 @@ } define <2 x i16> @vdivu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vdivu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: 
vdivu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -307,11 +367,21 @@ } define <4 x i16> @vdivu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.udiv.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -319,11 +389,21 @@ } define <4 x i16> @vdivu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vdivu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vdivu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -457,11 +537,21 @@ } define <2 x i32> @vdivu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.udiv.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -469,11 +559,21 @@ } define <2 x i32> @vdivu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; 
RV32-NEXT: vdivu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vdivu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll @@ -91,11 +91,21 @@ } define <4 x i8> @vmul_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.mul.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -103,11 +113,21 @@ } define <4 x i8> @vmul_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -141,11 +161,21 @@ } define <8 x i8> @vmul_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vmul.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, 
<8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.mul.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -153,11 +183,21 @@ } define <8 x i8> @vmul_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -241,11 +281,21 @@ } define <2 x i16> @vmul_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.mul.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -253,11 +303,21 @@ } define <2 x i16> @vmul_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -291,11 +351,21 @@ } define <4 x i16> @vmul_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vmul.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli 
zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.mul.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -303,11 +373,21 @@ } define <4 x i16> @vmul_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -503,11 +583,21 @@ } define <2 x i32> @vmul_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vmul.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.mul.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -515,11 +605,21 @@ } define <2 x i32> @vmul_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll @@ -117,11 +117,21 @@ } define <4 x 
i8> @vor_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.or.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -129,11 +139,21 @@ } define <4 x i8> @vor_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.or.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -141,11 +161,21 @@ } define <4 x i8> @vor_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -281,11 +311,21 @@ } define <8 x i8> @vor_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; 
RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.or.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -293,11 +333,21 @@ } define <8 x i8> @vor_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -433,11 +483,21 @@ } define <2 x i16> @vor_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.or.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -445,11 +505,21 @@ } define <2 x i16> @vor_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -509,11 +579,21 @@ } define <4 x i16> @vor_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i16: +; RV64: # %bb.0: 
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.or.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -521,11 +601,21 @@ } define <4 x i16> @vor_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -737,11 +827,21 @@ } define <2 x i32> @vor_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.or.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -749,11 +849,21 @@ } define <2 x i32> @vor_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -83,11 +83,21 @@ } define <4 x i8> @vpmerge_vx_v4i8(i8 %a, <4 x i8> %vb, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %a, i32 0 %va = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> %m, <4 x i8> %va, <4 x i8> %vb, i32 %evl) @@ -194,11 +204,21 @@ } define <8 x i8> @vpmerge_vx_v8i8(i8 %a, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %a, i32 0 %va = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> %m, <8 x i8> %va, <8 x i8> %vb, i32 %evl) @@ -268,11 +288,21 @@ } define <2 x i16> @vpmerge_vx_v2i16(i16 %a, <2 x i16> %vb, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %a, i32 0 %va = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> %m, <2 x i16> %va, <2 x i16> %vb, i32 %evl) @@ -305,11 +335,21 @@ } define <4 x i16> @vpmerge_vx_v4i16(i16 %a, <4 x i16> %vb, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; 
RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %a, i32 0 %va = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> %m, <4 x i16> %va, <4 x i16> %vb, i32 %evl) @@ -416,11 +456,21 @@ } define <2 x i32> @vpmerge_vx_v2i32(i32 %a, <2 x i32> %vb, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %a, i32 0 %va = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> %m, <2 x i32> %va, <2 x i32> %vb, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll @@ -96,11 +96,21 @@ } define <4 x i8> @vrem_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vrem.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vrem.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.srem.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -108,11 +118,21 @@ } define <4 x i8> @vrem_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vrem.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vrem.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = 
insertelement <4 x i1> poison, i1 true, i32 0 @@ -158,11 +178,21 @@ } define <8 x i8> @vrem_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vrem.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vrem.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.srem.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -170,11 +200,21 @@ } define <8 x i8> @vrem_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vrem.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vrem.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -258,11 +298,21 @@ } define <2 x i16> @vrem_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vrem.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vrem.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.srem.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -270,11 +320,21 @@ } define <2 x i16> @vrem_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vrem.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: 
vrem_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vrem.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -308,11 +368,21 @@ } define <4 x i16> @vrem_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vrem.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vrem.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.srem.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -320,11 +390,21 @@ } define <4 x i16> @vrem_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vrem.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vrem.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -458,11 +538,21 @@ } define <2 x i32> @vrem_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vrem.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vrem.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.srem.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -470,11 +560,21 @@ } define <2 x i32> @vrem_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: 
vrem_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vrem.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vrem.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll @@ -95,11 +95,21 @@ } define <4 x i8> @vremu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vremu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vremu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.urem.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -107,11 +117,21 @@ } define <4 x i8> @vremu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vremu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vremu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -157,11 +177,21 @@ } define <8 x i8> @vremu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vremu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vremu.vv v8, v8, v10, v0.t +; RV64-NEXT: 
ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.urem.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -169,11 +199,21 @@ } define <8 x i8> @vremu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vremu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vremu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -257,11 +297,21 @@ } define <2 x i16> @vremu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vremu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vremu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.urem.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -269,11 +319,21 @@ } define <2 x i16> @vremu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vremu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vremu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -307,11 +367,21 @@ } define <4 x i16> @vremu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vremu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; 
RV64-LABEL: vremu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vremu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.urem.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -319,11 +389,21 @@ } define <4 x i16> @vremu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vremu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vremu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -457,11 +537,21 @@ } define <2 x i32> @vremu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vremu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vremu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.urem.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -469,11 +559,21 @@ } define <2 x i32> @vremu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vremu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vremu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll @@ -61,11 +61,21 @@ declare <4 x i8> @llvm.vp.sub.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) define <4 x i8> @vrsub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.sub.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -73,11 +83,21 @@ } define <4 x i8> @vrsub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vrsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -115,11 +135,21 @@ declare <8 x i8> @llvm.vp.sub.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) define <8 x i8> @vrsub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.sub.v8i8(<8 x i8> %vb, <8 x i8> %va, <8 x i1> %m, i32 %evl) @@ -127,11 +157,21 @@ } define <8 x i8> @vrsub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, 
mu +; RV32-NEXT: vrsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -223,11 +263,21 @@ declare <2 x i16> @llvm.vp.sub.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) define <2 x i16> @vrsub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.sub.v2i16(<2 x i16> %vb, <2 x i16> %va, <2 x i1> %m, i32 %evl) @@ -235,11 +285,21 @@ } define <2 x i16> @vrsub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vrsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -277,11 +337,21 @@ declare <4 x i16> @llvm.vp.sub.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) define <4 x i16> @vrsub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v 
= call <4 x i16> @llvm.vp.sub.v4i16(<4 x i16> %vb, <4 x i16> %va, <4 x i1> %m, i32 %evl) @@ -289,11 +359,21 @@ } define <4 x i16> @vrsub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vrsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -439,11 +519,21 @@ declare <2 x i32> @llvm.vp.sub.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) define <2 x i32> @vrsub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.sub.v2i32(<2 x i32> %vb, <2 x i32> %va, <2 x i1> %m, i32 %evl) @@ -451,11 +541,21 @@ } define <2 x i32> @vrsub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vrsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll @@ -53,11 +53,20 @@ } define <4 x i8> @sadd_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: sadd_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v4i8_vx: +; RV32: # 
%bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -89,11 +98,20 @@ } define <8 x i8> @sadd_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: sadd_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vsadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -161,11 +179,20 @@ } define <2 x i16> @sadd_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: sadd_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -197,11 +224,20 @@ } define <4 x i16> @sadd_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: sadd_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v4i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vsadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -305,11 +341,20 @@ } define <2 x i32> @sadd_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: sadd_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vsadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; 
RV64-LABEL: sadd_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll @@ -53,11 +53,20 @@ } define <4 x i8> @uadd_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: uadd_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v4i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -89,11 +98,20 @@ } define <8 x i8> @uadd_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: uadd_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vsaddu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsaddu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -161,11 +179,20 @@ } define <2 x i16> @uadd_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: uadd_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -197,11 +224,20 @@ } define <4 x i16> @uadd_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: uadd_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v4i16_vx: +; RV32: # 
%bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vsaddu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsaddu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -305,11 +341,20 @@ } define <2 x i32> @uadd_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: uadd_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vsaddu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsaddu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll @@ -132,11 +132,21 @@ } define <4 x i8> @vsll_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsll.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsll.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.shl.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -144,11 +154,21 @@ } define <4 x i8> @vsll_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = 
insertelement <4 x i1> poison, i1 true, i32 0 @@ -208,11 +228,21 @@ } define <8 x i8> @vsll_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsll.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsll.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.shl.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -220,11 +250,21 @@ } define <8 x i8> @vsll_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsll.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -360,11 +400,21 @@ } define <2 x i16> @vsll_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsll.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsll.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.shl.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -372,11 +422,21 @@ } define <2 x i16> @vsll_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: 
vsll_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -436,11 +496,21 @@ } define <4 x i16> @vsll_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsll.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsll.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.shl.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -448,11 +518,21 @@ } define <4 x i16> @vsll_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsll.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -664,11 +744,21 @@ } define <2 x i32> @vsll_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsll.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsll.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.shl.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -676,11 +766,21 @@ } define <2 x i32> @vsll_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: 
vsll_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsll.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll @@ -122,11 +122,21 @@ } define <4 x i8> @vsra_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsra.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsra.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.ashr.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -134,11 +144,21 @@ } define <4 x i8> @vsra_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsra.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsra.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -210,11 +230,21 @@ } define <8 x i8> @vsra_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsra.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsra.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = 
insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.ashr.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -222,11 +252,21 @@ } define <8 x i8> @vsra_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsra.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsra.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -362,11 +402,21 @@ } define <2 x i16> @vsra_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsra.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsra.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.ashr.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -374,11 +424,21 @@ } define <2 x i16> @vsra_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsra.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsra.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -438,11 +498,21 @@ } define <4 x i16> @vsra_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsra.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i16: +; RV64: # 
%bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsra.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.ashr.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -450,11 +520,21 @@ } define <4 x i16> @vsra_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsra.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsra.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -666,11 +746,21 @@ } define <2 x i32> @vsra_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsra.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsra.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.ashr.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -678,11 +768,21 @@ } define <2 x i32> @vsra_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsra.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsra.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll @@ -121,11 +121,21 @@ } define <4 x i8> @vsrl_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.lshr.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -133,11 +143,21 @@ } define <4 x i8> @vsrl_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -209,11 +229,21 @@ } define <8 x i8> @vsrl_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.lshr.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -221,11 +251,21 @@ } define <8 x i8> @vsrl_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli 
zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -361,11 +401,21 @@ } define <2 x i16> @vsrl_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.lshr.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -373,11 +423,21 @@ } define <2 x i16> @vsrl_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -437,11 +497,21 @@ } define <4 x i16> @vsrl_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.lshr.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -449,11 +519,21 @@ } define <4 x i16> @vsrl_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: 
vsrl_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -665,11 +745,21 @@ } define <2 x i32> @vsrl_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.lshr.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -677,11 +767,21 @@ } define <2 x i32> @vsrl_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll @@ -54,11 +54,20 @@ } define <4 x i8> @ssub_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: ssub_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v4i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vssub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> 
%elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -91,11 +100,20 @@ } define <8 x i8> @ssub_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: ssub_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vssub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -165,11 +183,20 @@ } define <2 x i16> @ssub_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: ssub_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vssub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -202,11 +229,20 @@ } define <4 x i16> @ssub_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: ssub_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v4i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vssub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -313,11 +349,20 @@ } define <2 x i32> @ssub_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: ssub_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vssub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %va, <2 x 
i32> %vb) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll @@ -54,11 +54,20 @@ } define <4 x i8> @usub_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: usub_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v4i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -91,11 +100,20 @@ } define <8 x i8> @usub_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: usub_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vssubu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssubu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -165,11 +183,20 @@ } define <2 x i16> @usub_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: usub_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -202,11 +229,20 @@ } define <4 x i16> @usub_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: usub_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v4i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vssubu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssubu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> 
%elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -313,11 +349,20 @@ } define <2 x i32> @usub_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: usub_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vssubu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssubu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll @@ -141,11 +141,21 @@ } define <4 x i8> @vsub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.sub.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -153,11 +163,21 @@ } define <4 x i8> @vsub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -191,11 +211,21 @@ } define <8 x i8> @vsub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; 
RV64-LABEL: vsub_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.sub.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -203,11 +233,21 @@ } define <8 x i8> @vsub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -291,11 +331,21 @@ } define <2 x i16> @vsub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.sub.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -303,11 +353,21 @@ } define <2 x i16> @vsub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -341,11 +401,21 @@ } define <4 x i16> @vsub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i16: -; 
CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.sub.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -353,11 +423,21 @@ } define <4 x i16> @vsub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -491,11 +571,21 @@ } define <2 x i32> @vsub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.sub.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -503,11 +593,21 @@ } define <2 x i32> @vsub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> 
poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -420,12 +420,22 @@ } define <4 x i16> @vwadd_vx_v4i16(<4 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwadd_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwadd.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwadd.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -436,12 +446,22 @@ } define <2 x i32> @vwadd_vx_v2i32(<2 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwadd_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwadd.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwadd.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, <2 x i16>* %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -452,12 +472,22 @@ } define <8 x i16> @vwadd_vx_v8i16(<8 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwadd_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwadd.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwadd.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -468,12 +498,22 @@ } define <4 x i32> @vwadd_vx_v4i32(<4 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwadd_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwadd.vx v8, v9, a1 +; RV32-NEXT: ret +; 
+; RV64-LABEL: vwadd_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwadd.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -484,12 +524,22 @@ } define <2 x i64> @vwadd_vx_v2i64(<2 x i32>* %x, i32 %y) { -; CHECK-LABEL: vwadd_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwadd.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwadd.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -420,12 +420,22 @@ } define <4 x i16> @vwaddu_vx_v4i16(<4 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwaddu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwaddu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwaddu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -436,12 +446,22 @@ } define <2 x i32> @vwaddu_vx_v2i32(<2 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwaddu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwaddu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwaddu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, <2 x i16>* %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -452,12 +472,22 @@ } define <8 x i16> @vwaddu_vx_v8i16(<8 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwaddu_vx_v8i16: -; 
CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwaddu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwaddu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -468,12 +498,22 @@ } define <4 x i32> @vwaddu_vx_v4i32(<4 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwaddu_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwaddu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwaddu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -484,12 +524,22 @@ } define <2 x i64> @vwaddu_vx_v2i64(<2 x i32>* %x, i32 %y) { -; CHECK-LABEL: vwaddu_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwaddu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwaddu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -423,12 +423,22 @@ } define <4 x i16> @vwmul_vx_v4i16(<4 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwmul_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmul.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; 
RV64-NEXT: vwmul.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -439,12 +449,22 @@ } define <2 x i32> @vwmul_vx_v2i32(<2 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwmul_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmul.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwmul.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, <2 x i16>* %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -455,12 +475,22 @@ } define <8 x i16> @vwmul_vx_v8i16(<8 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwmul_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmul.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -471,12 +501,22 @@ } define <4 x i32> @vwmul_vx_v4i32(<4 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwmul_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmul.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -487,12 +527,22 @@ } define <2 x i64> @vwmul_vx_v2i64(<2 x i32>* %x, i32 %y) { -; CHECK-LABEL: vwmul_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, 
mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmul.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -456,12 +456,22 @@ } define <4 x i16> @vwmulsu_vx_v4i16(<4 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwmulsu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmulsu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwmulsu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -472,12 +482,22 @@ } define <2 x i32> @vwmulsu_vx_v2i32(<2 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwmulsu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmulsu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwmulsu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, <2 x i16>* %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -488,12 +508,22 @@ } define <8 x i16> @vwmulsu_vx_v8i16(<8 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwmulsu_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwmulsu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmulsu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -504,12 +534,22 @@ } define <4 x i32> @vwmulsu_vx_v4i32(<4 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwmulsu_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret 
+; RV32-LABEL: vwmulsu_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwmulsu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmulsu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -520,12 +560,22 @@ } define <2 x i64> @vwmulsu_vx_v2i64(<2 x i32>* %x, i32 %y) { -; CHECK-LABEL: vwmulsu_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwmulsu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmulsu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -420,12 +420,22 @@ } define <4 x i16> @vwsub_vx_v4i16(<4 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwsub_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwsub.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwsub.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -436,12 +446,22 @@ } define <2 x i32> @vwsub_vx_v2i32(<2 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwsub_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwsub.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwsub.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, <2 x i16>* %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = 
shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -452,12 +472,22 @@ } define <8 x i16> @vwsub_vx_v8i16(<8 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwsub_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwsub.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsub.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -468,12 +498,22 @@ } define <4 x i32> @vwsub_vx_v4i32(<4 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwsub_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwsub.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsub.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -484,12 +524,22 @@ } define <2 x i64> @vwsub_vx_v2i64(<2 x i32>* %x, i32 %y) { -; CHECK-LABEL: vwsub_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwsub.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsub.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -420,12 +420,22 @@ } define <4 x i16> @vwsubu_vx_v4i16(<4 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwsubu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: 
vwsubu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwsubu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -436,12 +446,22 @@ } define <2 x i32> @vwsubu_vx_v2i32(<2 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwsubu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwsubu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwsubu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, <2 x i16>* %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -452,12 +472,22 @@ } define <8 x i16> @vwsubu_vx_v8i16(<8 x i8>* %x, i8 %y) { -; CHECK-LABEL: vwsubu_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwsubu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsubu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -468,12 +498,22 @@ } define <4 x i32> @vwsubu_vx_v4i32(<4 x i16>* %x, i16 %y) { -; CHECK-LABEL: vwsubu_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwsubu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsubu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -484,12 +524,22 @@ } define <2 x i64> @vwsubu_vx_v2i64(<2 x i32>* %x, i32 %y) { -; CHECK-LABEL: vwsubu_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwsubu.vx v8, 
v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsubu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll @@ -155,11 +155,21 @@ } define <4 x i8> @vxor_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vxor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vxor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.xor.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -167,11 +177,21 @@ } define <4 x i8> @vxor_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vxor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -257,11 +277,21 @@ } define <8 x i8> @vxor_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vxor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vxor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.xor.v8i8(<8 x i8> %va, <8 x 
i8> %vb, <8 x i1> %m, i32 %evl) @@ -269,11 +299,21 @@ } define <8 x i8> @vxor_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vxor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vxor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -563,11 +603,21 @@ } define <2 x i16> @vxor_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vxor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vxor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.xor.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -575,11 +625,21 @@ } define <2 x i16> @vxor_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vxor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -665,11 +725,21 @@ } define <4 x i16> @vxor_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vxor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: 
vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vxor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.xor.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -677,11 +747,21 @@ } define <4 x i16> @vxor_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vxor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vxor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -971,11 +1051,21 @@ } define <2 x i32> @vxor_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vxor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vxor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.xor.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -983,11 +1073,21 @@ } define <2 x i32> @vxor_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vxor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vxor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -4286,12 +4286,15 @@ ; CHECK-LABEL: sink_splat_mul_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB81_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmul.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB81_1 @@ -4322,12 +4325,15 @@ ; CHECK-LABEL: sink_splat_add_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB82_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vadd.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB82_1 @@ -4358,12 +4364,15 @@ ; CHECK-LABEL: sink_splat_sub_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB83_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsub.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB83_1 @@ -4394,12 +4403,15 @@ ; CHECK-LABEL: sink_splat_rsub_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB84_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB84_1 @@ -4430,12 +4442,15 @@ ; CHECK-LABEL: sink_splat_and_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB85_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vand.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB85_1 @@ -4466,12 +4481,15 @@ ; CHECK-LABEL: sink_splat_or_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB86_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; 
CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vor.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB86_1 @@ -4502,12 +4520,15 @@ ; CHECK-LABEL: sink_splat_xor_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB87_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vxor.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB87_1 diff --git a/llvm/test/CodeGen/SystemZ/pr36164.ll b/llvm/test/CodeGen/SystemZ/pr36164.ll --- a/llvm/test/CodeGen/SystemZ/pr36164.ll +++ b/llvm/test/CodeGen/SystemZ/pr36164.ll @@ -17,18 +17,18 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lhi %r0, 1 ; CHECK-NEXT: larl %r1, g_938 -; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lhi %r2, 3 ; CHECK-NEXT: lhi %r3, 4 ; CHECK-NEXT: larl %r4, g_11 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: strl %r0, g_73 -; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 +; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -2993,6 +2993,7 @@ ; CHECK-NEXT: vldrb.u16 q1, [r1], #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vqshrnb.u16 q0, q0, #7 +; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vstrb.16 q0, [r2], #8 ; CHECK-NEXT: le lr, .LBB20_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll @@ -35,8 +35,30 @@ define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: vqdmulh_v8i8_b: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 -; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vstrh.32 q1, [r0, #8] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q0, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i8> %s0 to <8 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll 
b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -37,8 +37,30 @@ define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: vqdmulh_v8i8_b: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 -; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vstrh.32 q1, [r0, #8] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q0, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i8> %s0 to <8 x i32> diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -190,12 +190,11 @@ ; CHECK-LABEL: half_shuffle_i32x4: ; CHECK: .functype half_shuffle_i32x4 (v128) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 0, 0, 0 -; CHECK-NEXT: i32.const $push1=, 0 -; CHECK-NEXT: i32x4.replace_lane $push2=, $pop0, 0, $pop1 -; CHECK-NEXT: i32.const $push3=, 3 -; CHECK-NEXT: i32x4.replace_lane $push4=, $pop2, 3, $pop3 -; CHECK-NEXT: return $pop4 +; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.shuffle $push1=, $0, $pop0, 16, 17, 18, 19, 8, 9, 10, 11, 0, 1, 2, 3, 0, 0, 0, 0 +; CHECK-NEXT: i32.const $push2=, 3 +; CHECK-NEXT: i32x4.replace_lane $push3=, $pop1, 3, $pop2 +; CHECK-NEXT: return $pop3 %s0 = extractelement <4 x i32> %src, i32 0 %s2 = extractelement <4 x i32> %src, i32 2 %v0 = insertelement <4 x i32> undef, i32 0, i32 0 diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll --- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -17,12 +17,12 @@ define dso_local i32 @main() nounwind uwtable { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq i(%rip), %rsi -; CHECK-NEXT: movq j(%rip), %rax -; CHECK-NEXT: movq %rsi, %rdx -; CHECK-NEXT: shrq $8, %rdx +; CHECK-NEXT: movl i(%rip), %esi +; CHECK-NEXT: movl j(%rip), %eax +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: movsbl %al, %ecx -; CHECK-NEXT: shrq $8, %rax +; CHECK-NEXT: shrl $8, %eax ; CHECK-NEXT: cbtw ; CHECK-NEXT: idivb %dl ; CHECK-NEXT: movl %eax, %edx diff --git a/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll b/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll --- a/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll +++ b/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll @@ -8,13 +8,12 @@ define void @foo(i8 %arg4, i32 %arg5, i32* %arg14) nounwind { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $32, %edi -; CHECK-NEXT: leal 13(%rdi), %eax -; CHECK-NEXT: xorb $-14, %al -; CHECK-NEXT: addb $82, 
%al +; CHECK-NEXT: andl %edi, %esi +; CHECK-NEXT: andb $32, %dil +; CHECK-NEXT: movb $81, %al +; CHECK-NEXT: subb %dil, %al +; CHECK-NEXT: testb $32, %sil ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: testl %esi, %edi ; CHECK-NEXT: movl $1, %ecx ; CHECK-NEXT: cmovnel %eax, %ecx ; CHECK-NEXT: xorb $81, %cl diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -746,38 +746,34 @@ define i32 @add_U320_without_i128_add(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_add: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq 16(%rdi), %rax -; CHECK-NEXT: leaq (%rax,%rcx), %r10 +; CHECK-NEXT: movq 24(%rdi), %r10 +; CHECK-NEXT: movq 32(%rdi), %r11 ; CHECK-NEXT: addq %rsi, (%rdi) ; CHECK-NEXT: adcq %rdx, 8(%rdi) ; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: adcq %rcx, %rdx -; CHECK-NEXT: movq 24(%rdi), %r11 -; CHECK-NEXT: leaq (%r8,%r11), %r14 -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: cmpq %r10, %rdx -; CHECK-NEXT: setb %bl ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: adcq %r14, %rbx -; CHECK-NEXT: movq 32(%rdi), %r10 -; CHECK-NEXT: leaq (%r9,%r10), %rcx -; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: cmpq %r14, %rbx -; CHECK-NEXT: setb %sil -; CHECK-NEXT: addq %r11, %r8 -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: movq %r10, %rcx +; CHECK-NEXT: adcq %r8, %rcx +; CHECK-NEXT: cmpq %rax, %rdx +; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: leaq (%r11,%r9), %rbx +; CHECK-NEXT: addq %r8, %r10 +; CHECK-NEXT: movq %r11, %rsi +; CHECK-NEXT: adcq %r9, %rsi +; CHECK-NEXT: cmpq %r10, %rcx +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rcx, %rsi +; CHECK-NEXT: cmpq %rbx, %rsi ; CHECK-NEXT: setb %al -; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: addq %r9, %r11 ; CHECK-NEXT: movq %rdx, 16(%rdi) -; CHECK-NEXT: movq %rbx, 24(%rdi) +; CHECK-NEXT: movq %rcx, 24(%rdi) ; CHECK-NEXT: movq %rsi, 32(%rdi) ; CHECK-NEXT: adcl $0, %eax ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0 %8 = load i64, i64* %7, align 8 diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -15,9 +15,9 @@ ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -215,9 +215,8 @@ ; KNL-LABEL: fsub_noundef_ee: ; KNL: # %bb.0: ; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm0 -; KNL-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; KNL-NEXT: vsubpd %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; KNL-NEXT: vsubsd %xmm1, %xmm0, 
%xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fsub_noundef_ee: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -5917,31 +5917,74 @@ declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*) nounwind readonly define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_cmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8] -; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xe9] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_cmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: 
[0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_cmp_d_512: +; X64: ## %bb.0: +; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = 
call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -5967,23 +6010,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X86-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: 
vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -5992,23 +6042,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X64-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: 
[0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6034,31 +6094,74 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_ucmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_ucmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] 
+; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_ucmp_d_512: +; X64: ## %bb.0: +; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 
@llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -6084,23 +6187,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6109,23 +6219,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: 
[0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -73,9 +73,8 @@ ; ; X86-LABEL: mask8_zext: ; X86: ## %bb.0: -; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: notb %al -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl $255, %eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1967,16 +1967,16 @@ ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] -; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] -; X86-NEXT: adcl %edx, %esi # encoding: [0x11,0xd6] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] -; X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2140,16 +2140,16 @@ ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] -; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] -; X86-NEXT: adcl %edx, %esi # encoding: [0x11,0xd6] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] -; 
X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -4846,7 +4846,8 @@ ; X64-LABEL: test_cmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %r8 # encoding: [0xc4,0x61,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] @@ -4946,7 +4947,8 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %r8 # encoding: [0xc4,0x61,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] @@ -5040,7 +5042,8 @@ ; X64-LABEL: test_ucmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %r8 # encoding: [0xc4,0x61,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] @@ -5140,7 +5143,8 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %r8 # encoding: [0xc4,0x61,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] @@ -5186,31 +5190,71 @@ declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_cmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpgtw %ymm0, %ymm1, %k1 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8] -; CHECK-NEXT: 
vpcmplew %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc8] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: kshiftrq $32, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc1,0x20] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: 
[0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5236,23 +5280,32 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpgtw 
%ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X86-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k2 # encoding: [0xc4,0xe1,0xfc,0x4b,0xd0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: kshiftrq $32, %k2, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc2,0x20] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, 
%ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5261,23 +5314,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X64-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw 
%ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5303,31 +5361,71 @@ declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_ucmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc8] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: kshiftrq $32, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc1,0x20] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX 
Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 
# EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5353,23 +5451,32 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k2 # encoding: [0xc4,0xe1,0xfc,0x4b,0xd0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: kshiftrq $32, %k2, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc2,0x20] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 
# EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5378,23 +5485,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, 
%eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5420,30 +5532,69 @@ declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_cmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8] -; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: 
[0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc8] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: kshiftrq $32, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc1,0x20] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: 
[0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5469,23 +5620,32 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k2 # encoding: [0xc4,0xe1,0xfc,0x4b,0xd0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: kshiftrq $32, %k2, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc2,0x20] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrd $1, %ecx, 
%xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5493,23 +5653,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, 
%k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -5534,30 +5699,69 @@ declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_ucmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax 
# encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc8] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: kshiftrq $32, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc1,0x20] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: 
[0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5583,23 +5787,32 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw 
$5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kunpckdq %k0, %k0, %k2 # encoding: [0xc4,0xe1,0xfc,0x4b,0xd0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: kshiftrq $32, %k2, %k0 # encoding: [0xc4,0xe3,0xf9,0x31,0xc2,0x20] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5607,23 +5820,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -14,6 +14,7 @@ ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx @@ -31,6 +32,7 @@ ; X86-NEXT: andl $21845, %eax # imm = 0x5555 ; X86-NEXT: leal (%eax,%edx,2), %eax ; X86-NEXT: rolw $8, %cx +; X86-NEXT: movzwl %cx, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx @@ -290,6 +292,7 @@ ; X86: # %bb.0: ; 
X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $3855, %ecx # imm = 0xF0F ; X86-NEXT: shll $4, %ecx @@ -311,19 +314,19 @@ ; ; X64-LABEL: test_bitreverse_i16: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: rolw $8, %di -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $3855, %ecx # imm = 0xF0F +; X64-NEXT: shll $4, %ecx +; X64-NEXT: shrl $4, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F -; X64-NEXT: shll $4, %eax -; X64-NEXT: shrl $4, %edi -; X64-NEXT: andl $3855, %edi # imm = 0xF0F -; X64-NEXT: orl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $13107, %ecx # imm = 0x3333 +; X64-NEXT: shrl $2, %eax ; X64-NEXT: andl $13107, %eax # imm = 0x3333 -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $13107, %edi # imm = 0x3333 -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $21845, %ecx # imm = 0x5555 ; X64-NEXT: shrl %eax diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -50,12 +50,12 @@ define <4 x float> @test_negative_zero_1(<4 x float> %A) { ; SSE2-LABEL: test_negative_zero_1: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_negative_zero_1: @@ -80,19 +80,14 @@ ; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'. 
define <2 x double> @test_negative_zero_2(<2 x double> %A) { -; SSE2-LABEL: test_negative_zero_2: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_negative_zero_2: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; SSE41-NEXT: retq +; SSE-LABEL: test_negative_zero_2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-NEXT: retq ; ; AVX-LABEL: test_negative_zero_2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq entry: %0 = extractelement <2 x double> %A, i32 0 diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll --- a/llvm/test/CodeGen/X86/combine-fcopysign.ll +++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll @@ -78,6 +78,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_neg_constant1: ; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -94,6 +96,8 @@ ; AVX-LABEL: combine_vec_fcopysign_fneg_fabs_sgn: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y) diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -574,7 +574,7 @@ define i16 @test_i16_2032_mask_lshr_4(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_4: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -594,7 +594,7 @@ define i16 @test_i16_2032_mask_lshr_5(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_5: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: andl $63, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -614,7 +614,7 @@ define i16 @test_i16_2032_mask_lshr_6(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_6: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $6, %eax ; X86-NEXT: andl $31, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -755,7 +755,7 @@ define i16 @test_i16_2032_mask_ashr_4(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_4: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -775,7 +775,7 @@ define i16 @test_i16_2032_mask_ashr_5(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_5: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: andl $63, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -795,7 +795,7 @@ define i16 @test_i16_2032_mask_ashr_6(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_6: ; X86: # %bb.0: -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $6, %eax ; X86-NEXT: andl $31, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -194,10 +194,12 @@ define i32 @shl_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: shl_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %dil -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: leal 4(,%rax,4), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = shl i32 1, %sel @@ -207,9 +209,12 @@ define i32 @lshr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: lshr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: leal 8(,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = lshr i32 64, %sel @@ -219,10 +224,12 @@ define i32 @ashr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: ashr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: shll $4, %edi -; CHECK-NEXT: leal 16(%rdi), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $128, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = ashr i32 128, %sel diff --git a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll --- a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll +++ b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll @@ -73,7 +73,7 @@ define i16 @test5(i16 %f12) nounwind { ; i686-LABEL: test5: ; i686: # %bb.0: -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax +; i686-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; i686-NEXT: shrl $6, %eax ; i686-NEXT: movsbl %al, %eax ; i686-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -1127,46 +1127,68 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_u123: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm2, %xmm1 +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: movapd %xmm3, %xmm2 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-SLOW-NEXT: addsd %xmm3, %xmm2 -; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR34724_add_v4f64_u123: ; SSE-FAST: # %bb.0: ; SSE-FAST-NEXT: movapd %xmm1, %xmm0 -; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 +; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_u123: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vhaddpd 
%xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: PR34724_add_v4f64_u123: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3] +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm2, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 ; AVX1-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,1,11] +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512-SLOW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: PR34724_add_v4f64_u123: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,1,8] +; AVX512-FAST-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> @@ -1200,21 +1222,49 @@ ; SSE-FAST-NEXT: haddpd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_0u23: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [0,1,1,11] +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512-SLOW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,1,1,8] +; AVX512-FAST-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %5 = fadd <2 x double> %3, %4 @@ -1246,28 +1296,42 @@ ; SSE-FAST-NEXT: movapd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 
+; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> @@ -1299,22 +1363,39 @@ ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_012u: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_012u: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %5 = fadd <2 x double> %3, %4 diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -32,17 +32,17 @@ ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vshufps 
{{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] -; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: retq +; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] +; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: @@ -50,18 +50,6 @@ ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: retq -; -; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %7 = fadd <2 x float> %5, %6 @@ -126,34 +114,28 @@ ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; -; AVX1-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: retq +; AVX-FAST-LABEL: pair_sum_v4i32_v4i32: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: retq %5 = 
shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %7 = add <2 x i32> %5, %6 @@ -191,15 +173,14 @@ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4 -; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4 -; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1] +; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: @@ -264,43 +245,43 @@ ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[3,1] +; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: 
vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[3,1] +; AVX2-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX2-FAST-NEXT: retq %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> @@ -440,9 +421,11 @@ ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -461,9 +444,11 @@ ; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -751,16 +736,16 @@ ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm5 -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd 
%xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -775,15 +760,15 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> @@ -961,36 +946,37 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addss 
%xmm1, %xmm3 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSSE3-FAST-NEXT: addps %xmm4, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 +; SSSE3-FAST-NEXT: addps %xmm0, %xmm4 +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] @@ -998,27 +984,31 @@ ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSSE3-FAST-NEXT: addps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1030,10 +1020,12 @@ ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] 
+; AVX-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-FAST-NEXT: retq %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0) %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1) @@ -1050,100 +1042,94 @@ ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4 -; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2 -; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1 +; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0 +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm2 +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm3 +; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm3 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[2,0] ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; 
AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-SLOW-NEXT: retq +; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %eax +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %ecx +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %edx +; AVX-SLOW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vmovd %xmm2, %eax +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq ; -; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: retq +; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovd %xmm2, %eax +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0) %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1) %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2) diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -2283,18 +2283,34 @@ ; SSE-NEXT: divl %ecx ; SSE-NEXT: retq ; -; AVX-LABEL: PR44139: -; AVX: # %bb.0: -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: leal 2147483647(%rax), %ecx -; AVX-NEXT: testl %eax, %eax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divl %ecx -; AVX-NEXT: retq +; AVX1OR2-LABEL: PR44139: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: movl (%rdi), %eax +; AVX1OR2-NEXT: leal 2147483647(%rax), %ecx +; AVX1OR2-NEXT: testl %eax, %eax +; AVX1OR2-NEXT: cmovnsl %eax, %ecx +; AVX1OR2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; AVX1OR2-NEXT: addl %eax, %ecx +; AVX1OR2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1OR2-NEXT: xorl %edx, %edx +; AVX1OR2-NEXT: divl %ecx +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: PR44139: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: leal 2147483647(%rax), %ecx +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: cmovnsl %eax, %ecx +; AVX512-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; AVX512-NEXT: addl %eax, %ecx +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: divl %ecx +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; X86AVX2-LABEL: PR44139: ; X86AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll --- 
a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -318,9 +318,8 @@ ; ; CHECK-64-LABEL: is_neginf_f80: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-64-NEXT: notl %eax -; CHECK-64-NEXT: movzwl %ax, %eax +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx ; CHECK-64-NEXT: orq %rax, %rcx diff --git a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll --- a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll +++ b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll @@ -5,10 +5,9 @@ ; CHECK-LABEL: csrot_: ; CHECK: # %bb.0: ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: xorps %xmm0, %xmm1 -; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],mem[1,2,3] -; CHECK-NEXT: movlps %xmm1, (%rax) +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; CHECK-NEXT: movlps %xmm0, (%rax) ; CHECK-NEXT: retq 1: %2 = load float, float* %0, align 4 diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -108,14 +108,32 @@ } define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync { -; SSE-LABEL: load_float4_float3_as_float2_float: -; SSE: # %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float4_float3_as_float2_float: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float4_float3_as_float2_float: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float4_float3_as_float2_float: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_as_float2_float: ; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to <2 x float>* %3 = load <2 x float>, <2 x float>* %2, align 4 diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -51,7 +51,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -60,7 +60,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: 
retq @@ -133,7 +133,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -145,7 +145,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -429,7 +429,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -441,7 +441,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1218,7 +1218,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1228,7 +1228,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1326,7 +1326,7 @@ ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1339,7 +1339,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1596,7 +1596,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1612,7 +1612,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2507,7 +2507,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: psllw $5, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2517,7 +2517,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2615,7 +2615,7 @@ ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: psllw $5, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2628,7 +2628,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2885,7 +2885,7 @@ ; SSE-NEXT: packsswb 
%xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2901,7 +2901,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -134,31 +134,31 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] -; SSE42-NEXT: pmuludq %xmm3, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pmuludq %xmm1, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq ; ; AVX1-LABEL: _mul4xi32toi64a: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll --- a/llvm/test/CodeGen/X86/nontemporal-3.ll +++ b/llvm/test/CodeGen/X86/nontemporal-3.ll @@ -596,40 +596,40 @@ ; SSE-LABEL: test_zero_v8f64_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: movntiq %rax, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: movntiq %rax, 8(%rdi) ; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movntiq %rax, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: movntiq %rax, 40(%rdi) ; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f64_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: 
movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1 ret void @@ -639,67 +639,67 @@ ; SSE2-LABEL: test_zero_v16f32_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v16f32_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorl %eax, %eax -; SSE4A-NEXT: movntiq %rax, 24(%rdi) ; SSE4A-NEXT: movntiq %rax, 8(%rdi) -; SSE4A-NEXT: movntiq %rax, 56(%rdi) +; SSE4A-NEXT: movntiq %rax, 24(%rdi) ; SSE4A-NEXT: movntiq %rax, 40(%rdi) +; SSE4A-NEXT: movntiq %rax, 56(%rdi) ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v16f32_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 
24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1 ret void @@ -709,66 +709,66 @@ ; SSE2-LABEL: test_zero_v8i64_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v8i64_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v8i64_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1 ret void @@ -778,66 +778,66 @@ ; SSE2-LABEL: test_zero_v16i32_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; 
SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v16i32_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v16i32_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1 ret void @@ -847,66 +847,66 @@ ; SSE2-LABEL: test_zero_v32i16_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v32i16_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) 
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v32i16_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v32i16_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i16_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1 ret void @@ -916,66 +916,66 @@ ; SSE2-LABEL: test_zero_v64i8_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v64i8_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v64i8_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: 
test_zero_v64i8_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v64i8_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1 ret void diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -320,43 +320,41 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: pmulhw %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: packssdw %xmm7, %xmm8 ; SSE2-NEXT: pmulhw %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: packssdw %xmm5, %xmm6 -; SSE2-NEXT: pmulhw %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm6, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 +; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm7 ; SSE41-NEXT: pand %xmm6, %xmm8 ; SSE41-NEXT: packusdw %xmm7, %xmm8 ; SSE41-NEXT: pmulhw %xmm2, %xmm8 -; SSE41-NEXT: pand %xmm6, %xmm5 -; SSE41-NEXT: pand %xmm4, %xmm6 -; SSE41-NEXT: packusdw %xmm5, %xmm6 -; SSE41-NEXT: pmulhw %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: retq ; 
@@ -421,13 +419,6 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: ashr_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: packssdw %xmm7, %xmm6 @@ -435,25 +426,32 @@ ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: pmulhw %xmm6, %xmm2 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ashr_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm7 -; SSE41-NEXT: psrld $16, %xmm6 -; SSE41-NEXT: packusdw %xmm7, %xmm6 -; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm5 ; SSE41-NEXT: psrld $16, %xmm4 ; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: psrld $16, %xmm7 +; SSE41-NEXT: psrld $16, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: pmulhw %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm1 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -62,7 +62,7 @@ define i16 @cnt16(i16 %x) nounwind readnone { ; X86-LABEL: cnt16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $21845, %ecx # imm = 0x5555 @@ -1525,7 +1525,7 @@ define i32 @popcount_i16_zext(i16 zeroext %x) { ; X86-LABEL: popcount_i16_zext: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $21845, %ecx # imm = 0x5555 diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll --- a/llvm/test/CodeGen/X86/pr42727.ll +++ b/llvm/test/CodeGen/X86/pr42727.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: _ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastd (%eax), %ymm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; CHECK-NEXT: vpsllq $56, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%eax) ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll --- a/llvm/test/CodeGen/X86/pr53419.ll +++ b/llvm/test/CodeGen/X86/pr53419.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: llc < %s 
-mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86 @@ -62,21 +62,33 @@ } define i1 @intrinsic_v8i8(i8* align 1 %arg, i8* align 1 %arg1) { -; SSE-LABEL: intrinsic_v8i8: -; SSE: # %bb.0: # %bb -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax -; SSE-NEXT: cmpb $-1, %al -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: intrinsic_v8i8: +; SSE2: # %bb.0: # %bb +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE42-LABEL: intrinsic_v8i8: +; SSE42: # %bb.0: # %bb +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE42-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE42-NEXT: packsswb %xmm1, %xmm1 +; SSE42-NEXT: pmovmskb %xmm1, %eax +; SSE42-NEXT: cmpb $-1, %al +; SSE42-NEXT: sete %al +; SSE42-NEXT: retq ; ; AVX-LABEL: intrinsic_v8i8: ; AVX: # %bb.0: # %bb -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax ; AVX-NEXT: cmpb $-1, %al ; AVX-NEXT: sete %al @@ -86,9 +98,10 @@ ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; X86-NEXT: vpmovmskb %xmm0, %eax ; X86-NEXT: cmpb $-1, %al ; X86-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll --- a/llvm/test/CodeGen/X86/promote-vec3.ll +++ b/llvm/test/CodeGen/X86/promote-vec3.ll @@ -42,13 +42,13 @@ ; ; AVX-64-LABEL: zext_i8: ; AVX-64: # %bb.0: -; AVX-64-NEXT: movzbl %sil, %esi +; AVX-64-NEXT: movzbl %dl, %ecx +; AVX-64-NEXT: movzbl %sil, %edx ; AVX-64-NEXT: vmovd %edi, %xmm0 ; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-64-NEXT: movzbl %dl, %ecx ; AVX-64-NEXT: vmovd %xmm0, %eax ; AVX-64-NEXT: # 
kill: def $ax killed $ax killed $eax -; AVX-64-NEXT: movl %esi, %edx +; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx ; AVX-64-NEXT: retq %2 = zext <3 x i8> %0 to <3 x i16> diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1794,58 +1794,58 @@ ; SSE2OR3-LABEL: psubus_16i32_max: ; SSE2OR3: # %bb.0: # %vector.ph ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm5, %xmm8 ; SSE2OR3-NEXT: pxor %xmm9, %xmm8 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2OR3-NEXT: movdqa %xmm10, %xmm6 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183] +; SSE2OR3-NEXT: movdqa %xmm7, %xmm6 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2OR3-NEXT: pand %xmm6, %xmm5 +; SSE2OR3-NEXT: pxor %xmm8, %xmm6 +; SSE2OR3-NEXT: por %xmm5, %xmm6 +; SSE2OR3-NEXT: pslld $16, %xmm6 +; SSE2OR3-NEXT: psrad $16, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm10 +; SSE2OR3-NEXT: pxor %xmm9, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2OR3-NEXT: pand %xmm5, %xmm4 +; SSE2OR3-NEXT: pxor %xmm8, %xmm5 +; SSE2OR3-NEXT: por %xmm4, %xmm5 +; SSE2OR3-NEXT: pslld $16, %xmm5 +; SSE2OR3-NEXT: psrad $16, %xmm5 +; SSE2OR3-NEXT: packssdw %xmm6, %xmm5 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 +; SSE2OR3-NEXT: pxor %xmm9, %xmm4 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm6 +; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE2OR3-NEXT: pand %xmm6, %xmm3 ; SSE2OR3-NEXT: pxor %xmm8, %xmm6 ; SSE2OR3-NEXT: por %xmm3, %xmm6 ; SSE2OR3-NEXT: pslld $16, %xmm6 ; SSE2OR3-NEXT: psrad $16, %xmm6 -; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 -; SSE2OR3-NEXT: pxor %xmm9, %xmm3 -; SSE2OR3-NEXT: movdqa %xmm10, %xmm7 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2OR3-NEXT: pand %xmm7, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm7 -; SSE2OR3-NEXT: por %xmm2, %xmm7 +; SSE2OR3-NEXT: pxor %xmm2, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE2OR3-NEXT: pxor %xmm7, %xmm8 +; SSE2OR3-NEXT: pand %xmm2, %xmm7 +; SSE2OR3-NEXT: por %xmm8, %xmm7 ; SSE2OR3-NEXT: pslld $16, %xmm7 ; SSE2OR3-NEXT: psrad $16, %xmm7 ; SSE2OR3-NEXT: packssdw %xmm6, %xmm7 ; SSE2OR3-NEXT: psubusw %xmm7, %xmm0 -; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm9, %xmm2 -; SSE2OR3-NEXT: movdqa %xmm10, %xmm3 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2OR3-NEXT: pand %xmm3, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm3 -; SSE2OR3-NEXT: por %xmm5, %xmm3 -; SSE2OR3-NEXT: pslld $16, %xmm3 -; SSE2OR3-NEXT: psrad $16, %xmm3 -; SSE2OR3-NEXT: pxor %xmm4, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pxor %xmm10, %xmm8 -; SSE2OR3-NEXT: pand %xmm4, %xmm10 -; SSE2OR3-NEXT: por %xmm8, %xmm10 -; SSE2OR3-NEXT: pslld $16, %xmm10 -; SSE2OR3-NEXT: psrad $16, %xmm10 -; SSE2OR3-NEXT: packssdw %xmm3, %xmm10 -; SSE2OR3-NEXT: psubusw %xmm10, %xmm1 +; SSE2OR3-NEXT: psubusw %xmm5, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm6, %xmm5 +; SSE41-NEXT: pminud %xmm6, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: pminud %xmm6, %xmm3 ; SSE41-NEXT: pminud %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psubusw %xmm2, %xmm0 -; SSE41-NEXT: pminud %xmm6, %xmm5 -; SSE41-NEXT: pminud 
%xmm6, %xmm4 -; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: psubusw %xmm4, %xmm1 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -142,9 +142,9 @@ define i16 @test_i16_shl_lshr_2(i16 %a0) { ; X86-LABEL: test_i16_shl_lshr_2: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $2, %eax -; X86-NEXT: andl $16376, %eax # imm = 0x3FF8 +; X86-NEXT: andl $-8, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll --- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll @@ -4,28 +4,44 @@ define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzwl 6(%rdi), %r8d -; CHECK-NEXT: movzwl 4(%rdi), %r11d -; CHECK-NEXT: movq (%rsi), %rsi -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: pextrw $1, %xmm0, %r9d -; CHECK-NEXT: movd %xmm0, %r10d -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movq (%rsi), %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 +; CHECK-NEXT: pextrw $1, %xmm1, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: pextrw $1, %xmm0, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: pextrw $3, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm0, %edi -; CHECK-NEXT: movw %r11w, 8(%rdx) -; CHECK-NEXT: movw %cx, 4(%rdx) -; CHECK-NEXT: movw %r8w, 12(%rdx) -; CHECK-NEXT: movw %si, (%rdx) -; CHECK-NEXT: movw %di, 10(%rdx) -; CHECK-NEXT: movw %ax, 14(%rdx) -; CHECK-NEXT: movw %r10w, 2(%rdx) -; CHECK-NEXT: movw %r9w, 6(%rdx) +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: pextrw $2, %xmm0, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: pextrw $3, %xmm1, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: pextrw $2, %xmm1, %eax +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,65535,0,65535] +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,65535,0] +; CHECK-NEXT: pand %xmm0, %xmm2 +; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = mem[0,0,2,1,4,5,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; CHECK-NEXT: pandn %xmm1, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rdx) ; CHECK-NEXT: retq %tmp4 = load <4 x half>, <4 x half>* %a %tmp5 = load <4 x 
half>, <4 x half>* %b diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -196,53 +196,37 @@ } define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { -; SLM-LABEL: test_mul_v16i32_v16i8: -; SLM: # %bb.0: -; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM-NEXT: movdqa {{.*#+}} xmm5 = <18778,u,18778,u,18778,u,18778,u> -; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLM-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SLM-NEXT: pmaddwd %xmm5, %xmm0 -; SLM-NEXT: pmaddwd %xmm5, %xmm1 -; SLM-NEXT: pmaddwd %xmm5, %xmm2 -; SLM-NEXT: pmaddwd %xmm5, %xmm3 -; SLM-NEXT: ret{{[l|q]}} -; -; SLOW-LABEL: test_mul_v16i32_v16i8: -; SLOW: # %bb.0: -; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: movdqa {{.*#+}} xmm4 = <18778,u,18778,u,18778,u,18778,u> -; SLOW-NEXT: pmaddwd %xmm4, %xmm0 -; SLOW-NEXT: pmaddwd %xmm4, %xmm1 -; SLOW-NEXT: pmaddwd %xmm4, %xmm2 -; SLOW-NEXT: pmaddwd %xmm4, %xmm3 -; SLOW-NEXT: ret{{[l|q]}} +; SSE4-32-LABEL: test_mul_v16i32_v16i8: +; SSE4-32: # %bb.0: +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; SSE4-32-NEXT: retl ; -; SSE4-LABEL: test_mul_v16i32_v16i8: -; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE4-NEXT: pmovzxbd 
{{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = <18778,u,18778,u,18778,u,18778,u> -; SSE4-NEXT: pmaddwd %xmm4, %xmm0 -; SSE4-NEXT: pmaddwd %xmm4, %xmm1 -; SSE4-NEXT: pmaddwd %xmm4, %xmm2 -; SSE4-NEXT: pmaddwd %xmm4, %xmm3 -; SSE4-NEXT: ret{{[l|q]}} +; SSE4-64-LABEL: test_mul_v16i32_v16i8: +; SSE4-64: # %bb.0: +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE4-64-NEXT: retq ; ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8: ; AVX2-SLOW: # %bb.0: @@ -720,53 +704,37 @@ } define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { -; SLM-LABEL: test_mul_v16i32_v16i8_minsize: -; SLM: # %bb.0: -; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM-NEXT: movdqa {{.*#+}} xmm5 = <18778,u,18778,u,18778,u,18778,u> -; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLM-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SLM-NEXT: pmaddwd %xmm5, %xmm0 -; SLM-NEXT: pmaddwd %xmm5, %xmm1 -; SLM-NEXT: pmaddwd %xmm5, %xmm2 -; SLM-NEXT: pmaddwd %xmm5, %xmm3 -; SLM-NEXT: ret{{[l|q]}} -; -; SLOW-LABEL: test_mul_v16i32_v16i8_minsize: -; SLOW: # %bb.0: -; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: movdqa {{.*#+}} xmm4 = <18778,u,18778,u,18778,u,18778,u> -; SLOW-NEXT: pmaddwd %xmm4, %xmm0 -; SLOW-NEXT: pmaddwd %xmm4, %xmm1 -; SLOW-NEXT: pmaddwd %xmm4, %xmm2 -; SLOW-NEXT: pmaddwd %xmm4, %xmm3 -; SLOW-NEXT: ret{{[l|q]}} +; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize: +; SSE4-32: # %bb.0: +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; SSE4-32-NEXT: retl ; -; SSE4-LABEL: test_mul_v16i32_v16i8_minsize: -; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = <18778,u,18778,u,18778,u,18778,u> -; SSE4-NEXT: pmaddwd %xmm4, %xmm0 -; SSE4-NEXT: pmaddwd %xmm4, %xmm1 -; SSE4-NEXT: pmaddwd %xmm4, %xmm2 -; SSE4-NEXT: pmaddwd %xmm4, %xmm3 -; SSE4-NEXT: ret{{[l|q]}} +; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize: +; SSE4-64: # %bb.0: +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE4-64-NEXT: retq ; ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8_minsize: ; AVX2-SLOW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- 
a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -103,6 +103,8 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovaps (%edx), %xmm0 ; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; @@ -117,6 +119,8 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 ; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2] diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -1158,32 +1158,47 @@ } define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { -; SSE-LABEL: i32_shuf_W00W: -; SSE: ## %bb.0: -; SSE-NEXT: pshufd $255, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xff] -; SSE-NEXT: ## xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] -; SSE-NEXT: pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3] -; SSE-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-SSE-LABEL: i32_shuf_W00W: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[12,13,14,15] +; X86-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A] +; X86-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-SSE-NEXT: retl ## encoding: [0xc3] ; -; AVX1-LABEL: i32_shuf_W00W: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] -; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-AVX1-LABEL: i32_shuf_W00W: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[12,13,14,15] +; X86-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X86-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; -; AVX512-LABEL: i32_shuf_W00W: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] -; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-AVX512-LABEL: i32_shuf_W00W: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## EVEX TO VEX Compression xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[12,13,14,15] +; X86-AVX512-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X86-AVX512-NEXT: ## fixup A - offset: 5, value: 
{{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: i32_shuf_W00W: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[12,13,14,15] +; X64-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A] +; X64-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: i32_shuf_W00W: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[12,13,14,15] +; X64-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X64-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: i32_shuf_W00W: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[12,13,14,15] +; X64-AVX512-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X64-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: retq ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 3 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -1195,34 +1210,20 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { ; SSE-LABEL: i32_shuf_X00A: ; SSE: ## %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 ## encoding: [0x66,0x0f,0xef,0xd2] -; SSE-NEXT: pblendw $252, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0xfc] -; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE-NEXT: pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00] -; SSE-NEXT: ## xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0] -; SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; SSE-NEXT: insertps $54, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x36] +; SSE-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: i32_shuf_X00A: ; AVX1: ## %bb.0: -; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] -; AVX1-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] -; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] -; AVX1-NEXT: vpermilps $0, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x00] -; AVX1-NEXT: ## xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] -; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-NEXT: vinsertps $54, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36] +; AVX1-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: i32_shuf_X00A: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] -; AVX512-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] -; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] -; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] -; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-NEXT: vinsertps $54, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression 
encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36] +; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -1235,28 +1236,20 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { ; SSE-LABEL: i32_shuf_X00X: ; SSE: ## %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9] -; SSE-NEXT: pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00] -; SSE-NEXT: ## xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c] -; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; SSE-NEXT: insertps $54, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x36] +; SSE-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: i32_shuf_X00X: ; AVX1: ## %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] -; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX1-NEXT: vinsertps $54, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36] +; AVX1-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: i32_shuf_X00X: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX512-NEXT: vinsertps $54, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36] +; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -1269,32 +1262,26 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { ; SSE-LABEL: i32_shuf_X0YC: ; SSE: ## %bb.0: -; SSE-NEXT: pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0] -; SSE-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa] -; SSE-NEXT: ## xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f] -; SSE-NEXT: ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; SSE-NEXT: pmovzxdq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x35,0xc0] +; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE-NEXT: insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0] +; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: i32_shuf_X0YC: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0] ; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa] -; AVX1-NEXT: ## xmm1 = xmm1[2,2,2,2] -; AVX1-NEXT: vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0] -; AVX1-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] +; AVX1-NEXT: ## xmm0 = 
xmm0[0,1,2],xmm1[2] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: i32_shuf_X0YC: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0] ; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512-NEXT: vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa] -; AVX512-NEXT: ## xmm1 = xmm1[2,2,2,2] -; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] -; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] +; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -1595,8 +1582,9 @@ ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; X86-AVX1-NEXT: vmovups (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x08] +; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32: @@ -1615,8 +1603,9 @@ ; ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0f] +; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32: @@ -2125,14 +2114,14 @@ ; AVX1-LABEL: build_vector_to_shuffle_1: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_1: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 @@ -2153,14 +2142,14 @@ ; AVX1-LABEL: build_vector_to_shuffle_2: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_2: ; AVX512: ## %bb.0: ; 
AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -670,8 +670,10 @@ ; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -687,8 +689,10 @@ ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -2717,8 +2717,10 @@ ; SSE2-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -2734,8 +2736,10 @@ ; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -675,8 +675,10 @@ ; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -692,8 +694,10 @@ ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -755,8 +755,10 @@ ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 @@ -772,8 +774,10 @@ ; SSSE3-NEXT: pxor %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2390,8 +2390,10 @@ ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -2407,8 +2409,10 @@ ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm1 ; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -798,8 +798,10 @@ ; SSE2-NEXT: psubw %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: 
movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 @@ -816,8 +818,10 @@ ; SSSE3-NEXT: psubw %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -99,19 +99,19 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; SSE-LABEL: test_bitreverse_i16: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: rolw $8, %di -; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $3855, %ecx # imm = 0xF0F +; SSE-NEXT: shll $4, %ecx +; SSE-NEXT: shrl $4, %eax ; SSE-NEXT: andl $3855, %eax # imm = 0xF0F -; SSE-NEXT: shll $4, %eax -; SSE-NEXT: shrl $4, %edi -; SSE-NEXT: andl $3855, %edi # imm = 0xF0F -; SSE-NEXT: orl %eax, %edi -; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: orl %ecx, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $13107, %ecx # imm = 0x3333 +; SSE-NEXT: shrl $2, %eax ; SSE-NEXT: andl $13107, %eax # imm = 0x3333 -; SSE-NEXT: shrl $2, %edi -; SSE-NEXT: andl $13107, %edi # imm = 0x3333 -; SSE-NEXT: leal (%rdi,%rax,4), %eax +; SSE-NEXT: leal (%rax,%rcx,4), %eax ; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: andl $21845, %ecx # imm = 0x5555 ; SSE-NEXT: shrl %eax @@ -122,19 +122,19 @@ ; ; AVX-LABEL: test_bitreverse_i16: ; AVX: # %bb.0: -; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: rolw $8, %di -; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andl $3855, %ecx # imm = 0xF0F +; AVX-NEXT: shll $4, %ecx +; AVX-NEXT: shrl $4, %eax ; AVX-NEXT: andl $3855, %eax # imm = 0xF0F -; AVX-NEXT: shll $4, %eax -; AVX-NEXT: shrl $4, %edi -; AVX-NEXT: andl $3855, %edi # imm = 0xF0F -; AVX-NEXT: orl %eax, %edi -; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: orl %ecx, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andl $13107, %ecx # imm = 0x3333 +; AVX-NEXT: shrl $2, %eax ; AVX-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NEXT: shrl $2, %edi -; AVX-NEXT: andl $13107, %edi # imm = 0x3333 -; AVX-NEXT: leal (%rdi,%rax,4), %eax +; AVX-NEXT: leal (%rax,%rcx,4), %eax ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: andl $21845, %ecx # imm = 0x5555 ; AVX-NEXT: shrl %eax @@ -153,19 +153,19 @@ ; ; GFNISSE-LABEL: test_bitreverse_i16: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi ; GFNISSE-NEXT: rolw $8, %di -; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: movzwl %di, %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andl $3855, %ecx # imm = 0xF0F +; GFNISSE-NEXT: shll $4, %ecx +; GFNISSE-NEXT: shrl $4, %eax ; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F -; GFNISSE-NEXT: shll $4, %eax -; GFNISSE-NEXT: shrl $4, %edi -; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F -; GFNISSE-NEXT: orl %eax, %edi -; GFNISSE-NEXT: movl %edi, 
%eax +; GFNISSE-NEXT: orl %ecx, %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andl $13107, %ecx # imm = 0x3333 +; GFNISSE-NEXT: shrl $2, %eax ; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNISSE-NEXT: shrl $2, %edi -; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333 -; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax +; GFNISSE-NEXT: leal (%rax,%rcx,4), %eax ; GFNISSE-NEXT: movl %eax, %ecx ; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555 ; GFNISSE-NEXT: shrl %eax @@ -176,19 +176,19 @@ ; ; GFNIAVX-LABEL: test_bitreverse_i16: ; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi ; GFNIAVX-NEXT: rolw $8, %di -; GFNIAVX-NEXT: movl %edi, %eax +; GFNIAVX-NEXT: movzwl %di, %eax +; GFNIAVX-NEXT: movl %eax, %ecx +; GFNIAVX-NEXT: andl $3855, %ecx # imm = 0xF0F +; GFNIAVX-NEXT: shll $4, %ecx +; GFNIAVX-NEXT: shrl $4, %eax ; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F -; GFNIAVX-NEXT: shll $4, %eax -; GFNIAVX-NEXT: shrl $4, %edi -; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F -; GFNIAVX-NEXT: orl %eax, %edi -; GFNIAVX-NEXT: movl %edi, %eax +; GFNIAVX-NEXT: orl %ecx, %eax +; GFNIAVX-NEXT: movl %eax, %ecx +; GFNIAVX-NEXT: andl $13107, %ecx # imm = 0x3333 +; GFNIAVX-NEXT: shrl $2, %eax ; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNIAVX-NEXT: shrl $2, %edi -; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333 -; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax +; GFNIAVX-NEXT: leal (%rax,%rcx,4), %eax ; GFNIAVX-NEXT: movl %eax, %ecx ; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555 ; GFNIAVX-NEXT: shrl %eax diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -999,7 +999,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1010,7 +1010,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1021,7 +1021,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1032,7 +1032,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1052,7 +1052,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1087,7 +1087,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; XOPAVX2-NEXT: vpandn %xmm3, 
%xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -580,7 +580,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 @@ -597,7 +597,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -767,7 +767,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -778,7 +778,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -789,7 +789,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -800,7 +800,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -811,7 +811,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -330,7 +330,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 @@ -341,7 +341,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw 
$1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1032,7 +1032,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1043,7 +1043,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1054,7 +1054,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1065,7 +1065,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1085,7 +1085,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1121,7 +1121,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -582,7 +582,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -600,7 +600,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -801,7 +801,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ 
-812,7 +812,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -823,7 +823,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -834,7 +834,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -845,7 +845,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -328,7 +328,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -339,7 +339,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -651,100 +651,102 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7],ymm5[8],ymm0[9,10,11],ymm5[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-SLOW-NEXT: vmovdqa %ymm13, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -835,84 +837,86 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7],ymm5[8],ymm0[9,10,11],ymm5[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, 
%xmm12, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -920,7 +924,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1442,7 +1446,7 @@ ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $200, %rsp +; AVX2-SLOW-NEXT: subq $216, %rsp ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1451,12 +1455,16 
@@ ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 @@ -1471,12 +1479,18 @@ ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -1484,55 +1498,51 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 
96(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -1544,67 +1554,67 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 ; 
AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = 
xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm10 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -1617,10 +1627,10 @@ ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1634,19 +1644,19 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -1667,12 +1677,12 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $200, %rsp +; AVX2-SLOW-NEXT: addq $216, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1843,7 +1853,7 @@ ; ; AVX2-FAST-PERLANE-LABEL: vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1852,12 +1862,18 @@ ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: 
vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 @@ -1872,12 +1888,18 @@ ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -1885,158 +1907,152 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 
= [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm0 +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 
= xmm1[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 
= xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = 
xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] @@ -2052,12 +2068,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1322,32 +1322,33 @@ define void @vf32(<160 x i16>* %in.vec, <32 x i16>* %out.vec0, <32 x i16>* %out.vec1, <32 x i16>* %out.vec2, <32 x i16>* %out.vec3, <32 x i16>* %out.vec4) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa 240(%rdi), %xmm8 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: movdqa 288(%rdi), %xmm14 +; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movdqa 304(%rdi), %xmm10 +; SSE-NEXT: movdqa 240(%rdi), %xmm12 +; SSE-NEXT: movdqa 256(%rdi), %xmm9 +; SSE-NEXT: movdqa 288(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 
80(%rdi), %xmm8 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 ; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -1360,40 +1361,44 @@ ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm15 ; SSE-NEXT: andps %xmm7, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] 
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -1417,9 +1422,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 160(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] @@ -1431,17 +1436,17 @@ ; SSE-NEXT: andps %xmm7, %xmm2 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1449,26 +1454,23 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 ; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: 
punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $232, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1476,24 +1478,24 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1501,8 +1503,8 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -1512,11 +1514,12 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # 
xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1527,111 +1530,112 @@ ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0,1,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: andnps %xmm8, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm11 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} 
xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[3,0] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: andnps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm9[3,0] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: andnps %xmm9, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] @@ -1639,143 +1643,145 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm15, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: andnps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] 
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm12[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[0,2] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm13[3,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = 
xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm14[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[0,2] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; 
SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,1,3] +; SSE-NEXT: 
pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,0] +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,0] -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1800,16 +1806,16 @@ ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%r8) +; SSE-NEXT: movaps %xmm13, 32(%r8) ; SSE-NEXT: movaps %xmm15, (%r8) -; SSE-NEXT: movaps %xmm8, 48(%r8) +; SSE-NEXT: movaps %xmm9, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r9) -; SSE-NEXT: movaps %xmm12, 32(%r9) +; SSE-NEXT: movaps %xmm5, 32(%r9) +; SSE-NEXT: movaps %xmm3, (%r9) ; SSE-NEXT: movaps %xmm0, 48(%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: movaps %xmm6, 16(%r9) +; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -1638,48 +1638,46 @@ ; SSE-NEXT: movdqa 224(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm12 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm13 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: pslld $16, %xmm12 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1687,177 +1685,178 @@ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm3 +; SSE-NEXT: pslld $16, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = 
xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa 304(%rdi), %xmm15 -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa 352(%rdi), %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm12 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm11[2,3] ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd 
{{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm11 ; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; 
SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[2,0] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm13[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por 
%xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -1865,157 +1864,157 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm5[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm14[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm14[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $132, (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm10[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: 
pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm7 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = 
xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: psrlq $48, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 @@ -2024,48 +2023,47 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm3, 
%xmm5 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: andps %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2082,10 +2080,10 @@ ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2095,69 +2093,69 @@ ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm14[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: psrld $16, %xmm6 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm6 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = 
xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -2170,12 +2168,13 @@ ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -2186,34 +2185,35 @@ ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps %xmm15, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm14, 48(%r8) -; SSE-NEXT: movdqa %xmm11, 16(%r8) +; SSE-NEXT: movdqa %xmm10, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm3, 48(%r9) -; SSE-NEXT: movdqa %xmm4, 16(%r9) -; SSE-NEXT: movdqa %xmm5, 32(%r9) -; SSE-NEXT: movdqa %xmm9, (%r9) +; SSE-NEXT: movdqa %xmm4, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 16(%r9) +; SSE-NEXT: movdqa %xmm7, (%r9) +; SSE-NEXT: movdqa %xmm9, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 48(%rax) -; SSE-NEXT: movdqa %xmm7, 16(%rax) -; SSE-NEXT: movdqa %xmm15, 32(%rax) -; SSE-NEXT: movdqa %xmm1, (%rax) +; SSE-NEXT: movdqa %xmm11, 48(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm14, 32(%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) ; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -107,23 +107,23 @@ ; ; AVX1-LABEL: load_i32_stride4_vf4: ; 
AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm3[1],xmm4[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm0[2],xmm1[2] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,0],xmm3[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] -; AVX1-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm2[2],xmm3[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; AVX1-NEXT: vmovaps %xmm4, (%rsi) ; AVX1-NEXT: vmovaps %xmm5, (%rdx) ; AVX1-NEXT: vmovaps %xmm6, (%rcx) ; AVX1-NEXT: vmovaps %xmm0, (%r8) @@ -247,33 +247,33 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm5[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX1-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm7[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,0],ymm8[2,3],ymm9[6,4],ymm8[6,7] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[1],xmm7[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = xmm5[1],xmm6[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX1-NEXT: 
vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm9 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] -; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm5[2],xmm6[2] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm7[2],xmm3[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm3[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm6[3,0],xmm5[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovaps %ymm11, (%rsi) @@ -479,115 +479,110 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: subq $312, %rsp # imm = 0x138 ; AVX1-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-NEXT: vmovaps 96(%rdi), %ymm8 -; AVX1-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-NEXT: vmovaps 224(%rdi), %ymm15 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3,0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX1-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[4],ymm2[4],ymm7[5],ymm2[5] -; AVX1-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX1-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX1-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX1-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm12[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[4],ymm12[4],ymm7[5],ymm12[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 176(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm1[0] +; AVX1-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX1-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,0] ; AVX1-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1] +; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovaps %xmm1, %xmm14 -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm15[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[4],ymm15[4],ymm11[5],ymm15[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm7[1,0],ymm4[5,4],ymm7[5,4] -; AVX1-NEXT: vmovaps %ymm7, %ymm11 +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,0],ymm7[1,0],ymm12[5,4],ymm7[5,4] +; AVX1-NEXT: vmovaps %ymm7, %ymm9 ; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm4[1],zero,zero -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5] -; AVX1-NEXT: vmovaps %ymm5, %ymm2 +; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; 
AVX1-NEXT: # ymm0 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[1,0],ymm5[5,4],ymm1[5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,0],ymm1[1,0],ymm2[5,4],ymm1[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm12[1],xmm6[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[1],xmm6[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[6],ymm14[6],ymm11[7],ymm14[7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[6],ymm12[6],ymm9[7],ymm12[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,0],ymm6[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX1-NEXT: vmovaps %xmm9, %xmm10 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-NEXT: vmovaps %xmm8, %xmm10 ; AVX1-NEXT: vmovaps %xmm7, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm8[2] ; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX1-NEXT: vmovaps %ymm2, %ymm9 ; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm5[2] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm15[2],xmm14[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = 
ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-NEXT: # ymm2 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-NEXT: # ymm2 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm6[2,3],ymm2[6,4],ymm6[6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm11[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm9[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX1-NEXT: # ymm6 = ymm9[3,0],mem[3,0],ymm9[7,4],mem[7,4] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,0],ymm1[3,0],ymm5[7,4],ymm1[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,0],xmm12[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm8[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, 32(%rsi) @@ -1054,11 +1049,10 @@ ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX1-NEXT: vmovaps %ymm1, %ymm6 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm3, %ymm5 +; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1073,24 +1067,23 @@ ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-NEXT: vmovaps 192(%rdi), %ymm9 ; AVX1-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX1-NEXT: vmovaps %ymm3, %ymm13 +; AVX1-NEXT: vmovaps %ymm3, %ymm15 ; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-NEXT: vmovaps %ymm1, %ymm12 +; AVX1-NEXT: vmovaps %ymm1, %ymm13 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] +; AVX1-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm0, %ymm15 -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] ; AVX1-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX1-NEXT: vmovaps 128(%rdi), %xmm8 @@ -1111,15 +1104,15 @@ ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX1-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1127,7 +1120,8 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[4],ymm10[4],ymm6[5],ymm10[5] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm6[1,0],ymm5[5,4],ymm6[5,4] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm6[1,0],ymm1[5,4],ymm6[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -1138,12 +1132,12 @@ ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = 
ymm15[1,0],ymm9[1,0],ymm15[5,4],ymm9[5,4] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm12[1,0],ymm9[5,4],ymm12[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm4[1],zero,zero -; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vunpcklps (%rsp), %xmm2, %xmm7 # 16-byte Folded Reload ; AVX1-NEXT: # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1172,12 +1166,12 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[1,0],ymm1[5,4],ymm13[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = mem[0],xmm15[1],zero,zero ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = mem[0],xmm7[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload @@ -1202,8 +1196,8 @@ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX1-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm5[2],xmm6[2] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1225,11 +1219,11 @@ ; AVX1-NEXT: # ymm1 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm15[2] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm8[2],xmm9[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps 
{{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -1275,8 +1269,8 @@ ; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,0],xmm8[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm15[3,0],xmm13[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -298,13 +298,15 @@ ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-NEXT: vpextrd $2, %xmm5, %eax ; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm8 +; AVX512-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm6, %xmm6 ; AVX512-NEXT: vpextrd $1, %xmm3, %eax -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm0[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; AVX512-NEXT: vpinsrd $2, %eax, %xmm6, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm2[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,0,2,3] +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm7 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrd $1, %eax, %xmm7, %xmm7 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512-NEXT: vmovd %xmm4, %eax @@ -321,9 +323,10 @@ ; AVX512-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 ; AVX512-NEXT: vpextrd $2, %xmm4, %eax ; AVX512-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512-NEXT: vpextrd $1, %xmm5, %eax -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] ; AVX512-NEXT: vmovdqa %xmm8, (%rsi) @@ -879,32 +882,29 @@ define void @load_i32_stride6_vf16(<96 x i32>* %in.vec, <16 x i32>* %out.vec0, <16 x i32>* %out.vec1, <16 x i32>* %out.vec2, <16 x i32>* %out.vec3, <16 x i32>* %out.vec4, <16 x i32>* %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $376, %rsp # imm = 0x178 -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: movdqa 144(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm10 -; SSE-NEXT: movdqa 240(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm7 -; SSE-NEXT: movdqa 
192(%rdi), %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa 240(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 256(%rdi), %xmm7 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 ; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] @@ -912,30 +912,28 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 288(%rdi), %xmm9 -; SSE-NEXT: movdqa 304(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; 
SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa 336(%rdi), %xmm12 @@ -946,18 +944,16 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] @@ -971,155 +967,152 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] ; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: movdqa 
272(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm6 ; SSE-NEXT: movdqa 224(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movdqa 128(%rdi), %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm13 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,1,1] ; 
SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, %xmm11 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = 
mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1127,47 +1120,46 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movapd %xmm10, 16(%r8) ; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm11, 32(%r8) +; SSE-NEXT: movapd %xmm15, (%r8) ; SSE-NEXT: movapd %xmm3, 48(%r9) ; SSE-NEXT: movapd %xmm5, 16(%r9) -; SSE-NEXT: movapd %xmm6, 32(%r9) -; SSE-NEXT: movapd %xmm8, (%r9) +; SSE-NEXT: movapd %xmm6, (%r9) +; SSE-NEXT: movapd %xmm9, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 16(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) 
-; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm11, 48(%rax) -; SSE-NEXT: addq $376, %rsp # imm = 0x178 +; SSE-NEXT: movapd %xmm13, 16(%rax) +; SSE-NEXT: movapd %xmm14, (%rax) +; SSE-NEXT: movapd %xmm8, 32(%rax) +; SSE-NEXT: movapd %xmm0, 48(%rax) +; SSE-NEXT: addq $360, %rsp # imm = 0x168 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride6_vf16: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -282,16 +282,16 @@ ; AVX1-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX1-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm4[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm3[1],xmm0[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],xmm6[1] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 @@ -309,13 +309,13 @@ ; AVX1-NEXT: vmovaps %xmm1, 32(%rsi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX1-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-NEXT: vmovaps %xmm8, (%rdx) +; AVX1-NEXT: vmovaps %xmm4, (%rdx) +; AVX1-NEXT: vmovaps %xmm8, 32(%rdx) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX1-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-NEXT: vmovaps %ymm5, (%r8) ; AVX1-NEXT: vzeroupper @@ -587,143 +587,143 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: subq $296, %rsp # imm = 0x128 ; AVX1-NEXT: vmovaps 224(%rdi), %xmm8 -; AVX1-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm8[0] +; AVX1-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm8[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] +; AVX1-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 352(%rdi), %xmm4 +; AVX1-NEXT: vmovaps 352(%rdi), %xmm9 ; AVX1-NEXT: vmovaps 320(%rdi), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm4[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: 
vmovaps 160(%rdi), %xmm12 -; AVX1-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm12[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 288(%rdi), %xmm6 -; AVX1-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm6[0] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm9[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-NEXT: vmovaps (%rdi), %xmm7 ; AVX1-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm10[1] +; AVX1-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 448(%rdi), %xmm4 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm13[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm9[1] +; AVX1-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm8[1] +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 224(%rdi), %ymm13 -; AVX1-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = 
ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX1-NEXT: vmovaps 176(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 144(%rdi), %xmm10 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm11[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-NEXT: vmovaps 304(%rdi), %xmm15 ; AVX1-NEXT: vmovaps 272(%rdi), %xmm14 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm15[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm15[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-NEXT: vmovaps 432(%rdi), %xmm5 -; AVX1-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX1-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX1-NEXT: vmovaps 176(%rdi), %xmm10 +; AVX1-NEXT: vmovaps 144(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm10[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX1-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm5[1] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm15[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 112(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 96(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 64(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 32(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 48(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 32(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 64(%rdx) +; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 80(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, (%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 96(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 112(%rdx) +; AVX1-NEXT: vmovaps %ymm9, 96(%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm14[1],xmm15[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 112(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 64(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, (%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 32(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 80(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 16(%rsi) -; 
AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 48(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 112(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, (%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 16(%rdx) -; AVX1-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX1-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-NEXT: vmovaps %ymm0, (%r8) -; AVX1-NEXT: vmovaps %ymm10, 32(%r8) +; AVX1-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-NEXT: vmovaps %ymm4, (%r8) +; AVX1-NEXT: vmovaps %ymm8, 32(%r8) ; AVX1-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -731,97 +731,97 @@ ; AVX2-LABEL: load_i64_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $296, %rsp # imm = 0x128 -; AVX2-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX2-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm11[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm10[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm7[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm8[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm3[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm8[1] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm0[0] ; 
AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm15[1],xmm11[1] -; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 480(%rdi), %xmm11 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX2-NEXT: vmovaps 352(%rdi), %xmm7 +; AVX2-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm7[0] ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm7[1] +; AVX2-NEXT: vmovaps 160(%rdi), %xmm6 +; AVX2-NEXT: vmovaps 416(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-NEXT: vmovaps 480(%rdi), %xmm12 +; AVX2-NEXT: vmovaps 448(%rdi), %xmm5 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm12[0] ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm11[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm12[1] ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm13[0] +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm10[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm6[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm0[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm0[0] ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm0[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vmovaps 256(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm11[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm15[2,3] +; AVX2-NEXT: vmovaps 352(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm12[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm12[2,3] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm14[2,3] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX2-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm0[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; 
AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] @@ -830,7 +830,7 @@ ; AVX2-NEXT: vmovaps %xmm3, 112(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX2-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 64(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, (%rsi) @@ -843,32 +843,32 @@ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 48(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 32(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX2-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 80(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 16(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 96(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX2-NEXT: vmovaps %ymm12, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX2-NEXT: vmovaps %ymm10, 96(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-NEXT: vmovaps %ymm0, (%r8) -; AVX2-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-NEXT: vmovaps %ymm15, 32(%r8) ; AVX2-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -298,12 +298,12 @@ ; SSE-LABEL: load_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movaps 160(%rdi), %xmm8 +; SSE-NEXT: movaps 160(%rdi), %xmm9 ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 352(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: movaps 208(%rdi), %xmm9 +; SSE-NEXT: movaps 208(%rdi), %xmm8 ; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps (%rdi), %xmm11 ; SSE-NEXT: movaps 16(%rdi), %xmm10 @@ -336,34 +336,34 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movaps 304(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 272(%rdi), %xmm1 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps 320(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm2 @@ -383,11 +383,11 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps %xmm12, 16(%rcx) -; SSE-NEXT: movaps %xmm9, 48(%rcx) +; SSE-NEXT: movaps %xmm8, 48(%rcx) ; SSE-NEXT: movaps %xmm11, 32(%rcx) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm8, 48(%r8) +; SSE-NEXT: movaps %xmm7, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -395,14 +395,14 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps %xmm2, 48(%r9) -; SSE-NEXT: movaps %xmm5, 16(%r9) -; SSE-NEXT: movaps %xmm6, 32(%r9) +; SSE-NEXT: movaps %xmm4, 16(%r9) ; SSE-NEXT: movaps %xmm10, (%r9) +; SSE-NEXT: movaps %xmm6, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm4, 16(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -417,13 +417,13 @@ ; ; AVX1-LABEL: store_i64_stride6_vf8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd (%rdi), %ymm15 -; AVX1-NEXT: vmovapd 32(%rdi), %ymm12 -; AVX1-NEXT: vmovapd (%rsi), %ymm9 +; AVX1-NEXT: vmovapd (%rdi), %ymm6 +; AVX1-NEXT: vmovapd 32(%rdi), %ymm10 +; AVX1-NEXT: vmovapd (%rsi), %ymm7 ; AVX1-NEXT: vmovapd 32(%rsi), %ymm13 -; AVX1-NEXT: vmovapd (%r8), %ymm10 +; AVX1-NEXT: vmovapd (%r8), %ymm8 ; AVX1-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-NEXT: vmovapd 32(%r9), %ymm2 +; AVX1-NEXT: vmovapd 32(%r9), %ymm12 ; AVX1-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] @@ -431,76 +431,78 @@ ; AVX1-NEXT: vbroadcastsd 48(%rcx), %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-NEXT: vmovaps (%rsi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX1-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX1-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-NEXT: vblendpd {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm11 = ymm1[0],ymm7[1],ymm1[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm13[2,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = 
ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm7[0],ymm1[0],ymm7[2],ymm1[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0],ymm1[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 16(%rsi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX1-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm0[1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0],ymm9[1],ymm3[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[2,3],ymm13[2,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm3[0],ymm10[2],ymm3[3] ; AVX1-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm7[2,3],ymm1[4,5],ymm7[6,7] -; AVX1-NEXT: vmovapd (%r9), %ymm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],ymm9[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] -; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm10 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX1-NEXT: vmovaps (%rcx), %xmm10 -; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm15 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7] -; AVX1-NEXT: vmovapd 48(%rdx), %xmm5 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3] -; AVX1-NEXT: vmovapd 16(%rdx), %xmm5 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm13 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] +; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm13 +; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1],ymm13[2,3],ymm3[4,5],ymm13[6,7] +; AVX1-NEXT: vmovapd (%r9), %ymm3 +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm7[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] 
+; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3] +; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX1-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm11 +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7] +; AVX1-NEXT: vmovapd 48(%rdx), %xmm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm11 +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3] +; AVX1-NEXT: vmovapd 16(%rdx), %xmm6 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] +; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm11 +; AVX1-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX1-NEXT: vmovaps 32(%rdx), %xmm5 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-NEXT: vmovaps (%rdx), %xmm4 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps %xmm4, 16(%rax) -; AVX1-NEXT: vmovaps %xmm3, (%rax) +; AVX1-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-NEXT: vmovaps %xmm0, (%rax) ; AVX1-NEXT: vmovaps %xmm5, 208(%rax) -; AVX1-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-NEXT: vmovapd %ymm7, 128(%rax) +; AVX1-NEXT: vmovaps %xmm4, 192(%rax) +; AVX1-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-NEXT: vmovapd %ymm9, 128(%rax) ; AVX1-NEXT: vmovaps %ymm13, 256(%rax) -; AVX1-NEXT: vmovapd %ymm12, 320(%rax) -; AVX1-NEXT: vmovapd %ymm11, 32(%rax) -; AVX1-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-NEXT: vmovapd %ymm10, 320(%rax) +; AVX1-NEXT: vmovapd %ymm15, 32(%rax) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-NEXT: vmovapd %ymm3, 160(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -572,7 +572,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -588,7 +588,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al @@ -604,7 +604,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al @@ -643,7 +643,7 @@ ; 
AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al @@ -660,7 +660,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al @@ -674,11 +674,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: sete %al @@ -719,11 +719,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -738,11 +738,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al @@ -757,11 +757,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -489,11 +489,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -504,11 +504,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -519,11 +519,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -559,7 +559,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -575,7 +575,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -588,11 +588,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -630,11 +630,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -648,11 +648,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -666,11 +666,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -489,11 +489,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -504,11 +504,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -519,11 +519,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -559,7 +559,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -575,7 +575,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax 
killed $eax ; AVX2-NEXT: vzeroupper @@ -588,11 +588,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -630,11 +630,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -648,11 +648,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -666,11 +666,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -489,11 +489,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -504,11 +504,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -519,11 +519,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -559,7 +559,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -575,7 +575,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -588,11 +588,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -630,11 +630,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -648,11 +648,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -666,11 +666,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor 
%xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -766,7 +766,7 @@ ; ; AVX2-LABEL: splatvar_rotate_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -777,7 +777,7 @@ ; ; AVX512F-LABEL: splatvar_rotate_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -788,7 +788,7 @@ ; ; AVX512VL-LABEL: splatvar_rotate_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -799,7 +799,7 @@ ; ; AVX512BW-LABEL: splatvar_rotate_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -810,7 +810,7 @@ ; ; AVX512VLBW-LABEL: splatvar_rotate_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -343,7 +343,7 @@ ; ; AVX512BW-LABEL: splatvar_rotate_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 @@ -354,7 +354,7 @@ ; ; AVX512VLBW-LABEL: splatvar_rotate_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3657,37 +3657,27 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) { ; AVX1-LABEL: broadcast_concat_crash: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX1-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm1[3,3,1,1] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: broadcast_concat_crash: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,1,1] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss %xmm2, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: retq ; -; AVX512VL-SLOW-LABEL: broadcast_concat_crash: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: broadcast_concat_crash: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3] -; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 -; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-LABEL: broadcast_concat_crash: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: vbroadcastss %xmm2, %ymm2 +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [3,13,1,1,3,13,1,1] +; AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512VL-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: retq entry: %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> %bc = bitcast <8 x float> %tmp to <4 x i64> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -364,8 +364,8 @@ ; AMD10H: # %bb.0: ; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AMD10H-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: packuswb %xmm0, %xmm0 ; AMD10H-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2571,10 +2571,9 @@ ; ; SSE41-LABEL: splatshuf_zext_v4i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v4i64: @@ -2711,11 +2710,8 @@ ; ; SSE41-LABEL: splatshuf_zext_v16i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: 
pshufb {{.*#+}} xmm1 = xmm1[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v16i16: diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -283,6 +283,8 @@ ; SSE-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,0,0,0] ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: psllw $15, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -298,6 +300,8 @@ ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,0,0,0] ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -409,8 +409,8 @@ ; ; X64-WIN-LABEL: PR17487: ; X64-WIN: # %bb.0: +; X64-WIN-NEXT: andb $1, %cl ; X64-WIN-NEXT: movzbl %cl, %eax -; X64-WIN-NEXT: andl $1, %eax ; X64-WIN-NEXT: retq %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1 %tmp1 = zext <2 x i1> %tmp to <2 x i64>